From e215a1e27d84adad2635a52393621eb4fa439dc9 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 8 Nov 2024 13:05:35 -0500 Subject: [PATCH] [AMDGPU] Still set up the two SGPRs for queue ptr even it is COV5 (#112403) --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 +- .../CodeGen/AMDGPU/GlobalISel/addsubu64.ll | 44 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 284 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 284 +- .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 450 +- .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 480 +- .../AMDGPU/GlobalISel/bool-legalization.ll | 8 +- .../GlobalISel/buffer-load-store-pointers.ll | 140 +- .../GlobalISel/call-outgoing-stack-args.ll | 20 +- .../GlobalISel/crash-stack-address-O0.ll | 2 +- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 88 +- .../GlobalISel/divergent-control-flow.ll | 2 +- .../GlobalISel/dropped_debug_info_assert.ll | 57 +- .../GlobalISel/dynamic-alloca-uniform.ll | 30 +- .../AMDGPU/GlobalISel/extractelement.ll | 136 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 316 +- .../fmamix-constant-bus-violation.ll | 18 +- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 4 +- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 692 +- llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 628 +- .../AMDGPU/GlobalISel/function-returns.ll | 4 +- ...licit-kernarg-backend-usage-global-isel.ll | 43 +- .../GlobalISel/inline-asm-mismatched-size.ll | 8 +- .../GlobalISel/insertelement-stack-lower.ll | 6 +- .../AMDGPU/GlobalISel/insertelement.large.ll | 6 +- .../GlobalISel/irtranslator-amdgpu_kernel.ll | 472 +- .../GlobalISel/irtranslator-assert-align.ll | 16 +- .../GlobalISel/irtranslator-atomicrmw.ll | 16 +- .../irtranslator-call-abi-attribute-hints.ll | 108 +- .../irtranslator-call-implicit-args.ll | 952 +- .../irtranslator-call-return-values.ll | 3205 ++-- .../GlobalISel/irtranslator-call-sret.ll | 71 +- .../AMDGPU/GlobalISel/irtranslator-call.ll | 4248 +++--- .../irtranslator-constant-fold-vector-op.ll | 4 +- .../AMDGPU/GlobalISel/irtranslator-fence.ll | 160 +- .../GlobalISel/irtranslator-indirect-call.ll | 63 +- .../GlobalISel/irtranslator-inline-asm.ll | 102 +- .../GlobalISel/irtranslator-tail-call.ll | 4 +- .../AMDGPU/GlobalISel/lds-global-value.ll | 2 +- .../AMDGPU/GlobalISel/lds-zero-initializer.ll | 4 +- .../GlobalISel/llvm.amdgcn.div.scale.ll | 764 +- .../GlobalISel/llvm.amdgcn.end.cf.i32.ll | 8 +- .../GlobalISel/llvm.amdgcn.end.cf.i64.ll | 4 +- .../llvm.amdgcn.global.atomic.csub.ll | 24 +- .../GlobalISel/llvm.amdgcn.if.break.i32.ll | 8 +- .../GlobalISel/llvm.amdgcn.if.break.i64.ll | 4 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 227 +- .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll | 102 +- .../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll | 30 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 98 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 86 +- .../GlobalISel/llvm.amdgcn.trig.preop.ll | 30 +- .../AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll | 132 +- .../GlobalISel/llvm.amdgcn.update.dpp.ll | 98 +- .../CodeGen/AMDGPU/GlobalISel/localizer.ll | 4 +- .../madmix-constant-bus-violation.ll | 18 +- .../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 222 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 122 +- .../AMDGPU/GlobalISel/non-entry-alloca.ll | 18 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 1592 +- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 20 +- llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll | 4 +- .../AMDGPU/GlobalISel/store-local.128.ll | 488 +- .../AMDGPU/GlobalISel/store-local.96.ll | 460 +- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 837 +- .../AMDGPU/GlobalISel/vni8-across-blocks.ll | 250 +- .../GlobalISel/widen-i8-i16-scalar-loads.ll | 188 +- llvm/test/CodeGen/AMDGPU/add.ll | 791 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 420 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 106 +- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 2 +- llvm/test/CodeGen/AMDGPU/amd.endpgm.ll | 20 +- ...amdgpu-codegenprepare-fold-binop-select.ll | 2 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 5705 ++++--- .../AMDGPU/amdgpu-demote-scc-branches.ll | 258 +- .../CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll | 2 +- .../amdgpu-simplify-libcall-pow-codegen.ll | 60 +- .../amdgpu.work-item-intrinsics.deprecated.ll | 36 +- .../amdhsa-kernarg-preload-num-sgprs.ll | 10 +- .../AMDGPU/amdhsa-kernarg-preload-num-sgprs.o | Bin 0 -> 11280 bytes llvm/test/CodeGen/AMDGPU/amdpal-elf.ll | 2 +- llvm/test/CodeGen/AMDGPU/andorbitset.ll | 36 +- llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll | 36 +- llvm/test/CodeGen/AMDGPU/anyext.ll | 64 +- .../AMDGPU/atomic_optimizations_buffer.ll | 1227 +- .../atomic_optimizations_global_pointer.ll | 3645 +++-- .../atomic_optimizations_local_pointer.ll | 5839 ++++--- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 1026 +- .../atomic_optimizations_struct_buffer.ll | 1210 +- .../AMDGPU/atomics-hw-remarks-gfx90a.ll | 16 +- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 70 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 20 +- llvm/test/CodeGen/AMDGPU/bfe-combine.ll | 36 +- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 150 +- llvm/test/CodeGen/AMDGPU/bfi_int.ll | 464 +- llvm/test/CodeGen/AMDGPU/bfi_nested.ll | 2 +- llvm/test/CodeGen/AMDGPU/bfm.ll | 16 +- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 214 +- ...der-no-live-segment-at-def-implicit-def.ll | 67 +- llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 92 +- .../branch-folding-implicit-def-subreg.ll | 774 +- .../test/CodeGen/AMDGPU/branch-relax-spill.ll | 4 +- llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 268 +- llvm/test/CodeGen/AMDGPU/bswap.ll | 42 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 2556 ++-- .../buffer-fat-pointer-atomicrmw-fmax.ll | 1450 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 1450 +- .../CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll | 62 +- llvm/test/CodeGen/AMDGPU/build_vector.ll | 80 +- .../AMDGPU/call-alias-register-usage-agpr.ll | 2 +- .../AMDGPU/call-alias-register-usage0.ll | 2 +- .../AMDGPU/call-alias-register-usage1.ll | 2 +- .../AMDGPU/call-alias-register-usage2.ll | 2 +- .../AMDGPU/call-alias-register-usage3.ll | 2 +- llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 512 +- .../CodeGen/AMDGPU/call-argument-types.ll | 2684 ++-- .../CodeGen/AMDGPU/call-reqd-group-size.ll | 36 +- llvm/test/CodeGen/AMDGPU/call-waitcnt.ll | 89 +- .../callee-special-input-sgprs-fixed-abi.ll | 4 +- .../CodeGen/AMDGPU/calling-conventions.ll | 180 +- .../test/CodeGen/AMDGPU/carryout-selection.ll | 1834 ++- llvm/test/CodeGen/AMDGPU/cc-update.ll | 322 +- .../CodeGen/AMDGPU/cf-loop-on-constant.ll | 82 +- .../AMDGPU/cgp-addressing-modes-gfx1030.ll | 3 +- .../AMDGPU/cgp-addressing-modes-gfx908.ll | 2 +- .../CodeGen/AMDGPU/cgp-bitfield-extract.ll | 2 +- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 26 +- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 281 +- llvm/test/CodeGen/AMDGPU/clamp.ll | 668 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 96 +- llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 8 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 42 +- .../CodeGen/AMDGPU/combine-cond-add-sub.ll | 134 +- .../CodeGen/AMDGPU/combine-reg-or-const.ll | 4 +- .../CodeGen/AMDGPU/combine-vload-extract.ll | 22 +- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll | 133 +- .../CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll | 52 +- llvm/test/CodeGen/AMDGPU/copy_to_scc.ll | 24 +- .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 60 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 322 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 348 +- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 230 +- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 292 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 288 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 250 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 432 +- .../CodeGen/AMDGPU/dag-divergence-atomic.ll | 372 +- .../AMDGPU/dag-preserve-disjoint-flag.ll | 12 +- ...dagcomb-extract-vec-elt-different-sizes.ll | 36 +- .../CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll | 4 +- .../CodeGen/AMDGPU/dagcombine-setcc-select.ll | 8 +- .../AMDGPU/divergence-driven-buildvector.ll | 176 +- .../AMDGPU/divergence-driven-sext-inreg.ll | 8 +- .../AMDGPU/divergence-driven-trunc-to-i1.ll | 12 +- llvm/test/CodeGen/AMDGPU/ds-alignment.ll | 90 +- .../CodeGen/AMDGPU/ds-combine-large-stride.ll | 26 +- llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll | 24 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 267 +- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 158 +- .../AMDGPU/dwarf-multi-register-use-crash.ll | 12 +- ...cannot-create-empty-or-backward-segment.ll | 10 +- .../expand-scalar-carry-out-select-user.ll | 8 +- .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 752 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 319 +- .../CodeGen/AMDGPU/extract_vector_elt-i16.ll | 8 +- .../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 96 +- .../CodeGen/AMDGPU/extractelt-to-trunc.ll | 80 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 155 +- llvm/test/CodeGen/AMDGPU/fabs.f64.ll | 82 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 56 +- llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 260 +- .../fast-unaligned-load-store.global.ll | 36 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 416 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 472 +- llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 1860 +-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 1204 +- llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 448 +- llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 482 +- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 432 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 601 +- .../CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 142 +- llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll | 58 +- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 108 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 644 +- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 4546 +++--- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 478 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 9142 ++++++----- .../AMDGPU/flat_atomics_i64_noprivate.ll | 2450 +-- .../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 12653 ++++++++++++---- .../flat_atomics_i64_system_noprivate.ll | 270 +- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 994 +- llvm/test/CodeGen/AMDGPU/fma.ll | 12 +- llvm/test/CodeGen/AMDGPU/fmax3.ll | 144 +- llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll | 16 +- llvm/test/CodeGen/AMDGPU/fmaximum.ll | 20 +- llvm/test/CodeGen/AMDGPU/fmed3.ll | 1102 +- llvm/test/CodeGen/AMDGPU/fmin3.ll | 212 +- llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll | 32 +- llvm/test/CodeGen/AMDGPU/fminimum.ll | 20 +- .../AMDGPU/fmul-2-combine-multi-use.ll | 252 +- llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 480 +- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 472 +- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 178 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 76 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 178 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 182 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 102 +- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 44 +- llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 124 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 286 +- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 16 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 258 +- .../AMDGPU/fp-min-max-buffer-atomics.ll | 307 +- .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 283 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll | 8 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll | 8 +- llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 8 +- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 592 +- .../AMDGPU/fp64-min-max-buffer-atomics.ll | 228 +- .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 228 +- llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 128 +- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 96 +- llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 876 +- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 74 +- llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 76 +- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 444 +- llvm/test/CodeGen/AMDGPU/fptrunc.ll | 430 +- llvm/test/CodeGen/AMDGPU/frem.ll | 1412 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 686 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 472 +- llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll | 148 +- llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 348 +- .../CodeGen/AMDGPU/function-args-inreg.ll | 1157 +- .../CodeGen/AMDGPU/function-resource-usage.ll | 8 +- llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll | 60 +- llvm/test/CodeGen/AMDGPU/gds-allocation.ll | 2 +- .../CodeGen/AMDGPU/gep-const-address-space.ll | 16 +- .../AMDGPU/gfx11-user-sgpr-init16-bug.ll | 22 +- .../global-atomicrmw-fadd-wrong-subtarget.ll | 14 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 74 +- .../global-atomics-fp-wrong-subtarget.ll | 4 +- .../CodeGen/AMDGPU/global-i16-load-store.ll | 24 +- .../AMDGPU/global-load-saddr-to-vaddr.ll | 4 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 4720 +++--- .../AMDGPU/global_atomics_i32_system.ll | 460 +- .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 4234 +++--- .../AMDGPU/global_atomics_i64_system.ll | 300 +- .../AMDGPU/global_atomics_scan_fadd.ll | 5172 ++++--- .../AMDGPU/global_atomics_scan_fmax.ll | 3322 ++-- .../AMDGPU/global_atomics_scan_fmin.ll | 3322 ++-- .../AMDGPU/global_atomics_scan_fsub.ll | 4980 +++--- llvm/test/CodeGen/AMDGPU/global_smrd.ll | 4 +- .../AMDGPU/greedy-reverse-local-assignment.ll | 12 +- llvm/test/CodeGen/AMDGPU/half.ll | 494 +- .../identical-subrange-spill-infloop.ll | 2 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 414 +- llvm/test/CodeGen/AMDGPU/idot2.ll | 1665 +- llvm/test/CodeGen/AMDGPU/idot4s.ll | 1686 +- llvm/test/CodeGen/AMDGPU/idot4u.ll | 2647 ++-- llvm/test/CodeGen/AMDGPU/idot8s.ll | 732 +- llvm/test/CodeGen/AMDGPU/idot8u.ll | 980 +- llvm/test/CodeGen/AMDGPU/imm.ll | 600 +- llvm/test/CodeGen/AMDGPU/imm16.ll | 488 +- .../AMDGPU/implicit-kernarg-backend-usage.ll | 44 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 2405 ++- .../AMDGPU/indirect-addressing-term.ll | 4 +- .../AMDGPU/indirect-call-known-callees.ll | 68 +- llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 12 +- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 24 +- .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 126 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 2163 +-- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 1638 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 514 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 757 +- .../insert_waitcnt_for_precise_memory.ll | 314 +- llvm/test/CodeGen/AMDGPU/kernel-args.ll | 1359 +- .../kernel-vgpr-spill-mubuf-with-voffset.ll | 55 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 412 +- .../CodeGen/AMDGPU/lds-zero-initializer.ll | 4 +- .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 24 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll | 2 +- .../AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll | 2 +- .../AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 386 +- .../CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 1064 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 1020 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4 +- .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 6 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 4 +- ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 27 +- .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 8 +- .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 8 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 706 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 672 +- .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 4 +- .../AMDGPU/llvm.amdgcn.intersect_ray.ll | 92 +- .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 38 +- .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 38 +- .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 38 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 4564 +++--- .../AMDGPU/llvm.amdgcn.permlane16.var.ll | 200 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 10 +- .../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 68 +- .../llvm.amdgcn.raw.atomic.buffer.load.ll | 20 +- .../llvm.amdgcn.raw.buffer.atomic.fadd.ll | 30 +- .../llvm.amdgcn.raw.ptr.atomic.buffer.load.ll | 20 +- ...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 8 +- ...amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll | 80 +- ...m.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll | 50 +- .../llvm.amdgcn.raw.ptr.buffer.load.bf16.ll | 96 +- .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 312 +- .../llvm.amdgcn.raw.ptr.buffer.store.ll | 336 +- .../llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll | 156 +- .../llvm.amdgcn.raw.tbuffer.store.d16.ll | 196 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 36 +- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 76 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 280 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 280 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 26 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 2 +- .../llvm.amdgcn.sched.group.barrier.gfx11.ll | 8 +- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 8 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 328 +- .../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll | 24 +- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 86 +- .../llvm.amdgcn.struct.atomic.buffer.load.ll | 88 +- .../llvm.amdgcn.struct.buffer.atomic.fadd.ll | 24 +- ...vm.amdgcn.struct.ptr.atomic.buffer.load.ll | 88 +- ...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 4 +- ...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 64 +- ...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 40 +- ...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 190 +- ...mdgcn.struct.ptr.buffer.atomic.fmax.f64.ll | 108 +- ...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 190 +- ...mdgcn.struct.ptr.buffer.atomic.fmin.f64.ll | 108 +- ...dgcn.struct.ptr.buffer.store.format.d16.ll | 48 +- ...lvm.amdgcn.struct.ptr.tbuffer.store.d16.ll | 110 +- .../llvm.amdgcn.struct.tbuffer.store.d16.ll | 160 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 268 +- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 552 +- llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 16 +- llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 36 +- llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll | 2 +- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 737 +- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 737 +- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 322 +- llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll | 16 +- llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 480 +- llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll | 12 +- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 30 +- .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 36 +- llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 58 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 461 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 461 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 482 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 58 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 66 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 100 +- llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 592 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 46 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 66 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 100 +- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 592 +- llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 236 +- .../AMDGPU/llvm.r600.read.local.size.ll | 108 +- llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 84 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 830 +- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 770 +- llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 36 +- llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/load-constant-f32.ll | 4 +- llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 16 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 266 +- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 312 +- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 250 +- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 48 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 416 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 244 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 206 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 437 +- .../CodeGen/AMDGPU/local-memory.amdgcn.ll | 6 +- .../local-stack-alloc-block-sp-reference.ll | 20 +- .../AMDGPU/long-branch-reserve-register.ll | 68 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 32 +- llvm/test/CodeGen/AMDGPU/loop_break.ll | 12 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 2 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 8 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 99 +- .../AMDGPU/lower-module-lds-via-table.ll | 90 +- .../lower-work-group-id-intrinsics-hsa.ll | 124 +- llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll | 2 +- llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll | 28 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 252 +- ...ne-sink-temporal-divergence-swdev407790.ll | 481 +- llvm/test/CodeGen/AMDGPU/mad.u16.ll | 24 +- .../CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +- llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 102 +- llvm/test/CodeGen/AMDGPU/madak.ll | 747 +- .../match-perm-extract-vector-elt-bug.ll | 30 +- .../materialize-frame-index-sgpr.gfx10.ll | 16 +- .../CodeGen/AMDGPU/max-hard-clause-length.ll | 12 +- llvm/test/CodeGen/AMDGPU/max.i16.ll | 214 +- llvm/test/CodeGen/AMDGPU/max.ll | 346 +- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 72 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 44 +- .../test/CodeGen/AMDGPU/memcpy-scalar-load.ll | 16 +- .../CodeGen/AMDGPU/memmove-scalar-load.ll | 8 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 2940 ++-- .../AMDGPU/memory-legalizer-flat-lastuse.ll | 24 +- .../memory-legalizer-flat-nontemporal.ll | 328 +- .../memory-legalizer-flat-singlethread.ll | 2940 ++-- .../AMDGPU/memory-legalizer-flat-system.ll | 2940 ++-- .../AMDGPU/memory-legalizer-flat-volatile.ll | 244 +- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 2926 ++-- .../AMDGPU/memory-legalizer-flat-workgroup.ll | 2868 ++-- .../AMDGPU/memory-legalizer-global-agent.ll | 3013 ++-- .../AMDGPU/memory-legalizer-global-lastuse.ll | 24 +- .../memory-legalizer-global-nontemporal.ll | 362 +- .../memory-legalizer-global-singlethread.ll | 3028 ++-- .../AMDGPU/memory-legalizer-global-system.ll | 2949 ++-- .../memory-legalizer-global-volatile.ll | 291 +- .../memory-legalizer-global-wavefront.ll | 3028 ++-- .../memory-legalizer-global-workgroup.ll | 3028 ++-- .../AMDGPU/memory-legalizer-local-agent.ll | 4094 ++--- .../memory-legalizer-local-nontemporal.ll | 357 +- .../memory-legalizer-local-singlethread.ll | 4094 ++--- .../AMDGPU/memory-legalizer-local-system.ll | 4094 ++--- .../AMDGPU/memory-legalizer-local-volatile.ll | 262 +- .../memory-legalizer-local-wavefront.ll | 4094 ++--- .../memory-legalizer-local-workgroup.ll | 4094 ++--- .../memory-legalizer-private-lastuse.ll | 24 +- .../memory-legalizer-private-nontemporal.ll | 427 +- .../memory-legalizer-private-volatile.ll | 232 +- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 71 +- llvm/test/CodeGen/AMDGPU/min.ll | 1226 +- llvm/test/CodeGen/AMDGPU/minimumnum.ll | 66 +- .../AMDGPU/module-lds-false-sharing.ll | 98 +- .../CodeGen/AMDGPU/move-to-valu-addsubu64.ll | 20 +- .../AMDGPU/move-to-valu-atomicrmw-system.ll | 50 +- .../CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 34 +- .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 160 +- .../move-to-valu-pseudo-scalar-trans.ll | 40 +- llvm/test/CodeGen/AMDGPU/mul.ll | 1745 ++- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 234 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 426 +- llvm/test/CodeGen/AMDGPU/multilevel-break.ll | 2 +- .../AMDGPU/need-fp-from-vgpr-spills.ll | 66 +- .../CodeGen/AMDGPU/nested-loop-conditions.ll | 2 +- llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 67 +- llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 308 +- .../CodeGen/AMDGPU/offset-split-global.ll | 264 +- llvm/test/CodeGen/AMDGPU/omod.ll | 32 +- llvm/test/CodeGen/AMDGPU/optimize-compare.ll | 26 +- .../CodeGen/AMDGPU/optimize-negated-cond.ll | 4 +- llvm/test/CodeGen/AMDGPU/or.ll | 500 +- llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 60 +- llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 54 +- .../AMDGPU/partial-sgpr-to-vgpr-spills.ll | 16 +- .../CodeGen/AMDGPU/partial-shift-shrink.ll | 2 +- llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll | 4 +- llvm/test/CodeGen/AMDGPU/permute.ll | 52 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 96 +- .../AMDGPU/post-ra-soft-clause-dbg-info.ll | 8 +- .../AMDGPU/preload-implicit-kernargs.ll | 16 +- .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 124 +- .../CodeGen/AMDGPU/private-memory-atomics.ll | 12 +- .../AMDGPU/promote-constOffset-to-imm.ll | 168 +- .../AMDGPU/ptr-buffer-alias-scheduling.ll | 100 +- llvm/test/CodeGen/AMDGPU/rcp-pattern.ll | 164 +- .../AMDGPU/reassoc-mul-add-1-to-mad.ll | 137 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 172 +- llvm/test/CodeGen/AMDGPU/rotr.ll | 130 +- llvm/test/CodeGen/AMDGPU/rsq.f32.ll | 70 +- llvm/test/CodeGen/AMDGPU/s-barrier.ll | 103 +- llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll | 52 +- llvm/test/CodeGen/AMDGPU/sad.ll | 142 +- llvm/test/CodeGen/AMDGPU/saddo.ll | 336 +- llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 20 +- .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 16 +- .../scc-clobbered-sgpr-to-vmem-spill.ll | 6 +- .../AMDGPU/schedule-amdgpu-trackers.ll | 6 +- llvm/test/CodeGen/AMDGPU/sdiv.ll | 642 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 394 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 1004 +- .../CodeGen/AMDGPU/select-constant-cttz.ll | 2 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 300 +- .../AMDGPU/sext-divergence-driven-isel.ll | 44 +- llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll | 101 +- .../CodeGen/AMDGPU/sgpr-copy-local-cse.ll | 2 +- .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 4 +- .../sgpr-spill-update-only-slot-indexes.ll | 24 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 12 +- llvm/test/CodeGen/AMDGPU/shl.ll | 354 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 266 +- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 840 +- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll | 44 +- .../AMDGPU/si-annotate-cfg-loop-assert.ll | 2 +- .../si-unify-exit-multiple-unreachables.ll | 46 +- llvm/test/CodeGen/AMDGPU/sign_extend.ll | 220 +- .../CodeGen/AMDGPU/simple-indirect-call.ll | 33 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 64 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 302 +- llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 140 +- llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll | 2 +- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 398 +- llvm/test/CodeGen/AMDGPU/smrd.ll | 4 +- llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll | 2 +- llvm/test/CodeGen/AMDGPU/spill-m0.ll | 2 +- .../AMDGPU/spill-offset-calculation.ll | 30 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 40 +- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 10 +- .../CodeGen/AMDGPU/spill-writelane-vgprs.ll | 2 +- llvm/test/CodeGen/AMDGPU/sra.ll | 100 +- llvm/test/CodeGen/AMDGPU/srem.ll | 818 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 564 +- llvm/test/CodeGen/AMDGPU/srl.ll | 36 +- ...tack-pointer-offset-relative-frameindex.ll | 18 +- .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 136 +- llvm/test/CodeGen/AMDGPU/store-local.128.ll | 360 +- llvm/test/CodeGen/AMDGPU/store-local.96.ll | 328 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 96 +- llvm/test/CodeGen/AMDGPU/sub.ll | 348 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 556 +- ...-call-inreg-arguments.convergencetokens.ll | 30 +- .../AMDGPU/tail-call-inreg-arguments.ll | 38 +- ...-in-vgprs-issue110930.convergencetokens.ll | 30 +- llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 40 +- llvm/test/CodeGen/AMDGPU/trunc-store.ll | 168 +- llvm/test/CodeGen/AMDGPU/trunc.ll | 2 +- .../AMDGPU/tuple-allocation-failure.ll | 330 +- llvm/test/CodeGen/AMDGPU/uaddo.ll | 232 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 112 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 158 +- llvm/test/CodeGen/AMDGPU/udivrem.ll | 375 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 146 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 228 +- llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 140 +- llvm/test/CodeGen/AMDGPU/uniform-cfg.ll | 220 +- llvm/test/CodeGen/AMDGPU/uniform-select.ll | 8 +- .../AMDGPU/unstructured-cfg-def-use-issue.ll | 14 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 368 +- llvm/test/CodeGen/AMDGPU/usubo.ll | 232 +- .../CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll | 8 +- llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll | 4 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 940 +- llvm/test/CodeGen/AMDGPU/v_madak_f16.ll | 200 +- llvm/test/CodeGen/AMDGPU/v_pack.ll | 82 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 122 +- .../CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll | 8 +- .../CodeGen/AMDGPU/vector-extract-insert.ll | 44 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 46 +- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 26 +- .../AMDGPU/vgpr-spill-placement-issue61083.ll | 2 +- .../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 52 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 286 +- llvm/test/CodeGen/AMDGPU/vselect.ll | 144 +- .../CodeGen/AMDGPU/waterfall_kills_scc.ll | 12 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 684 +- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 72 +- .../AMDGPU/workgroup-id-in-arch-sgprs.ll | 32 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 388 +- llvm/test/CodeGen/AMDGPU/xor.ll | 398 +- .../AMDGPU/zext-divergence-driven-isel.ll | 26 +- .../MIR/AMDGPU/machine-function-info.ll | 13 +- .../InferAddressSpaces/AMDGPU/flat_atomic.ll | 84 +- .../Inputs/amdgpu_isel.ll.expected | 10 +- 571 files changed, 124086 insertions(+), 133397 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.o diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index ab62e530a18d0c..360f29e1551fae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -465,9 +465,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(DispatchPtrReg); } - const Module *M = MF.getFunction().getParent(); - if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { + if (UserSGPRInfo.hasQueuePtr()) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 419414e5bd993d..37dc433d154f64 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2376,9 +2376,7 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (UserSGPRInfo.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - const Module *M = MF.getFunction().getParent(); - if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) + if (UserSGPRInfo.hasQueuePtr()) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a @@ -2429,9 +2427,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(DispatchPtrReg); } - const Module *M = MF.getFunction().getParent(); - if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { + if (UserSGPRInfo.hasQueuePtr()) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll index ad3c588f575512..6a04dd492fcea6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -6,28 +6,28 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_add_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s0, s6, s0 -; GFX11-NEXT: s_addc_u32 s1, s7, s1 +; GFX11-NEXT: s_add_u32 s2, s2, s4 +; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_add_u64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: %add = add i64 %a, %b @@ -52,28 +52,28 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_sub_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_u32 s0, s6, s0 -; GFX11-NEXT: s_subb_u32 s1, s7, s1 +; GFX11-NEXT: s_sub_u32 s2, s2, s4 +; GFX11-NEXT: s_subb_u32 s3, s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_u64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: %sub = sub i64 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index d38a9051175bed..23f24a9dc9982a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -1494,7 +1494,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1504,7 +1504,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -1531,7 +1531,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1542,13 +1542,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1557,14 +1553,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1573,28 +1565,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s20 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1604,28 +1592,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1635,26 +1619,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1670,7 +1650,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1680,7 +1660,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -1706,7 +1686,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1717,13 +1697,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1732,13 +1708,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1746,28 +1718,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1776,28 +1744,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1806,27 +1770,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1842,7 +1802,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen @@ -1875,7 +1835,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1885,7 +1845,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ -1917,13 +1877,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1932,12 +1888,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1945,16 +1897,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1966,30 +1914,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2001,26 +1945,22 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2036,7 +1976,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2067,7 +2007,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2077,7 +2017,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2107,13 +2047,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2122,12 +2058,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2135,14 +2067,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2152,30 +2080,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_mov_b32_e32 v9, v2 ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2185,28 +2109,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_mov_b32_e32 v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 3678eb5ac76821..11024b0a88d6b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -1494,7 +1494,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1504,7 +1504,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -1531,7 +1531,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1542,13 +1542,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1557,14 +1553,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1573,28 +1565,24 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s20 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1604,28 +1592,24 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1635,26 +1619,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1670,7 +1650,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1680,7 +1660,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -1706,7 +1686,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1717,13 +1697,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1732,13 +1708,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1746,28 +1718,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1776,28 +1744,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1806,27 +1770,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1842,7 +1802,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen @@ -1875,7 +1835,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1885,7 +1845,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ -1917,13 +1877,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1932,12 +1888,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1945,16 +1897,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1966,30 +1914,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2001,26 +1945,22 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2036,7 +1976,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2067,7 +2007,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2077,7 +2017,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2107,13 +2047,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2122,12 +2058,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2135,14 +2067,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2152,30 +2080,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: v_mov_b32_e32 v9, v2 ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2185,28 +2109,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: v_mov_b32_e32 v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index ea44612465579c..b96fc71be057e7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,10 +74,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -93,8 +93,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -108,8 +108,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -123,8 +123,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -136,11 +136,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -151,10 +151,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -171,7 +171,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -182,7 +182,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -193,7 +193,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -214,7 +214,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_dec_u32 v0, v1 @@ -228,7 +228,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -239,7 +239,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -250,7 +250,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -260,7 +260,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -271,7 +271,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_dec_u32 v1, v0 offset:16 @@ -286,7 +286,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -301,7 +301,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -316,7 +316,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -328,7 +328,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -341,7 +341,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc @@ -358,7 +358,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -375,7 +375,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -392,7 +392,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -404,7 +404,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -417,7 +417,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -435,7 +435,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -452,7 +452,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -469,7 +469,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -481,7 +481,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -494,7 +494,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -512,7 +512,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -524,7 +524,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -536,7 +536,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -547,7 +547,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -559,7 +559,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] @@ -574,7 +574,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -588,7 +588,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -602,7 +602,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -613,7 +613,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -625,7 +625,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -641,7 +641,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -655,7 +655,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -669,7 +669,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -680,7 +680,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -692,7 +692,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -708,7 +708,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -730,7 +730,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -752,7 +752,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -764,7 +764,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -777,7 +777,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -800,7 +800,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -817,7 +817,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -834,7 +834,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -845,7 +845,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +857,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -877,7 +877,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -892,7 +892,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -907,7 +907,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -922,7 +922,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -938,7 +938,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -957,7 +957,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -974,7 +974,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -991,7 +991,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1006,7 +1006,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1078,7 +1078,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1093,7 +1093,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1111,7 +1111,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1143,7 +1143,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1155,7 +1155,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1181,7 +1181,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1238,7 +1238,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1254,7 +1254,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1272,7 +1272,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1286,7 +1286,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1328,7 +1328,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1368,7 +1368,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1390,7 +1390,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1410,7 +1410,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -1464,7 +1464,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1532,7 +1532,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -1558,7 +1558,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1579,7 +1579,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1600,7 +1600,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1616,7 +1616,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1653,7 +1653,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1676,7 +1676,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1699,7 +1699,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1715,7 +1715,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,7 +1734,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1755,7 +1755,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1768,7 +1768,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1781,7 +1781,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,7 +1794,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1827,7 +1827,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1842,7 +1842,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1857,7 +1857,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1870,7 +1870,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,7 +1887,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1906,7 +1906,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1921,7 +1921,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1949,7 +1949,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1966,7 +1966,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1985,7 +1985,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2011,7 +2011,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2037,7 +2037,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2058,7 +2058,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -2082,7 +2082,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -2114,7 +2114,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2132,7 +2132,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2150,7 +2150,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2166,7 +2166,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2186,7 +2186,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -2212,7 +2212,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2230,7 +2230,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_shl_base_lds_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2248,7 +2248,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_dec_shl_base_lds_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2264,7 +2264,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 @@ -2278,7 +2278,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; GFX11-LABEL: atomic_dec_shl_base_lds_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 @@ -2303,8 +2303,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2319,8 +2319,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2335,8 +2335,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2349,12 +2349,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2365,11 +2365,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2385,8 +2385,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2401,8 +2401,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2417,8 +2417,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2431,12 +2431,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2447,11 +2447,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2468,7 +2468,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2480,7 +2480,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2492,7 +2492,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2503,7 +2503,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2515,7 +2515,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2530,7 +2530,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2554,7 +2554,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2565,7 +2565,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2577,7 +2577,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2593,7 +2593,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2609,7 +2609,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2625,7 +2625,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2638,7 +2638,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2652,7 +2652,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2670,7 +2670,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2688,7 +2688,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2706,7 +2706,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2719,7 +2719,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2733,7 +2733,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2752,7 +2752,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2770,7 +2770,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2788,7 +2788,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2801,7 +2801,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2815,7 +2815,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2834,7 +2834,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2847,7 +2847,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2860,7 +2860,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2872,7 +2872,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2885,7 +2885,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2901,7 +2901,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2916,7 +2916,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2931,7 +2931,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2943,7 +2943,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2956,7 +2956,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2973,7 +2973,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2988,7 +2988,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3003,7 +3003,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3015,7 +3015,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -3028,7 +3028,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3045,7 +3045,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3068,7 +3068,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3091,7 +3091,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3104,7 +3104,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3118,7 +3118,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -3142,7 +3142,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3160,7 +3160,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3178,7 +3178,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -3190,7 +3190,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -3203,7 +3203,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -3224,7 +3224,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -3243,7 +3243,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -3263,7 +3263,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3280,7 +3280,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -3296,7 +3296,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 9 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 4023e053c66c5b..dc9e1f24438302 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,10 +74,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -93,8 +93,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -108,8 +108,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -123,8 +123,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -136,11 +136,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -151,10 +151,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -171,7 +171,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -182,7 +182,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -193,7 +193,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -214,7 +214,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_inc_u32 v0, v1 @@ -228,7 +228,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -239,7 +239,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -250,7 +250,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -260,7 +260,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -271,7 +271,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_inc_u32 v1, v0 offset:16 @@ -286,7 +286,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -301,7 +301,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -316,7 +316,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -328,7 +328,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -341,7 +341,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] glc @@ -358,7 +358,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -375,7 +375,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -392,7 +392,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -404,7 +404,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -417,7 +417,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -435,7 +435,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -452,7 +452,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -469,7 +469,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -481,7 +481,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -494,7 +494,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -512,7 +512,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -524,7 +524,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -536,7 +536,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -547,7 +547,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -559,7 +559,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] @@ -574,7 +574,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -588,7 +588,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -602,7 +602,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -613,7 +613,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -625,7 +625,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -641,7 +641,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -655,7 +655,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -669,7 +669,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -680,7 +680,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -692,7 +692,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -708,7 +708,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -730,7 +730,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -752,7 +752,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -764,7 +764,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -777,7 +777,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -800,7 +800,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -817,7 +817,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -834,7 +834,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -845,7 +845,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +857,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -877,7 +877,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -895,7 +895,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -913,7 +913,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -929,7 +929,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 @@ -943,7 +943,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; GFX11-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 @@ -968,8 +968,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -984,8 +984,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1000,8 +1000,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1014,12 +1014,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1030,11 +1030,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1050,8 +1050,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1066,8 +1066,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1082,8 +1082,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,12 +1096,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1112,11 +1112,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1145,7 +1145,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1207,7 +1207,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1219,7 +1219,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1230,7 +1230,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1258,7 +1258,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1274,7 +1274,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1290,7 +1290,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1303,7 +1303,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,7 +1353,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1384,7 +1384,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1398,7 +1398,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1417,7 +1417,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1435,7 +1435,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1453,7 +1453,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1466,7 +1466,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1512,7 +1512,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,7 +1525,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1537,7 +1537,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1550,7 +1550,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1581,7 +1581,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1608,7 +1608,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1621,7 +1621,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1638,7 +1638,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,7 +1653,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1668,7 +1668,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1680,7 +1680,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1693,7 +1693,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1710,7 +1710,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1733,7 +1733,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1756,7 +1756,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1769,7 +1769,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1783,7 +1783,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1825,7 +1825,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1843,7 +1843,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1855,7 +1855,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1868,7 +1868,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1889,7 +1889,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1919,7 +1919,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -1950,7 +1950,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1986,7 +1986,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2003,7 +2003,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2018,7 +2018,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2036,7 +2036,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2056,7 +2056,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2073,7 +2073,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2090,7 +2090,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2105,7 +2105,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2123,7 +2123,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2143,7 +2143,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2155,7 +2155,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2167,7 +2167,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2179,7 +2179,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2193,7 +2193,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2210,7 +2210,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2224,7 +2224,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2238,7 +2238,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2250,7 +2250,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2284,7 +2284,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2298,7 +2298,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2312,7 +2312,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2324,7 +2324,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2340,7 +2340,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2358,7 +2358,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2380,7 +2380,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2402,7 +2402,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2422,7 +2422,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2445,7 +2445,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -2476,7 +2476,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2493,7 +2493,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2510,7 +2510,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2525,7 +2525,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2544,7 +2544,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -2570,7 +2570,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -2589,7 +2589,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -2609,7 +2609,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2626,7 +2626,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -2642,7 +2642,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 9 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 @@ -2667,7 +2667,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2688,7 +2688,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2709,7 +2709,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2725,7 +2725,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2742,7 +2742,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2762,7 +2762,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2785,7 +2785,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2808,7 +2808,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2824,7 +2824,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2843,7 +2843,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2864,7 +2864,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2887,7 +2887,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2910,7 +2910,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2926,7 +2926,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2945,7 +2945,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2966,7 +2966,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2992,7 +2992,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3005,7 +3005,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3020,7 +3020,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3038,7 +3038,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3053,7 +3053,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3068,7 +3068,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3081,7 +3081,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3098,7 +3098,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3117,7 +3117,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3147,7 +3147,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3160,7 +3160,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3177,7 +3177,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3196,7 +3196,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3222,7 +3222,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3248,7 +3248,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3269,7 +3269,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -3325,7 +3325,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3361,7 +3361,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3377,7 +3377,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -3397,7 +3397,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -3423,8 +3423,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: nocse_lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3443,8 +3443,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3463,8 +3463,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3479,11 +3479,11 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3498,10 +3498,10 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX11-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll index c45bccd184c12f..876f1622a24a71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -66,7 +66,7 @@ define amdgpu_ps i32 @select_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; WAVE64-LABEL: sgpr_trunc_brcond: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dword s0, s[2:3], 0x24 +; WAVE64-NEXT: s_load_dword s0, s[4:5], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 ; WAVE64-NEXT: s_and_b32 s0, s0, 1 @@ -83,7 +83,7 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; ; WAVE32-LABEL: sgpr_trunc_brcond: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dword s0, s[2:3], 0x24 +; WAVE32-NEXT: s_load_dword s0, s[4:5], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_xor_b32 s0, s0, 1 ; WAVE32-NEXT: s_and_b32 s0, s0, 1 @@ -113,7 +113,7 @@ bb1: define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; WAVE64-LABEL: brcond_sgpr_trunc_and: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_and_b32 s0, s0, s1 ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 @@ -131,7 +131,7 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; ; WAVE32-LABEL: brcond_sgpr_trunc_and: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_and_b32 s0, s0, s1 ; WAVE32-NEXT: s_xor_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll index 091c9f143ce7ee..20735bb6c21c6c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll @@ -4,12 +4,12 @@ define ptr @buffer_load_p0(ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_load_p0 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) @@ -25,15 +25,15 @@ define ptr @buffer_load_p0(ptr addrspace(8) inreg %buf) { define void @buffer_store_p0(ptr %data, ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_store_p0 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) @@ -45,12 +45,12 @@ define void @buffer_store_p0(ptr %data, ptr addrspace(8) inreg %buf) { define ptr addrspace(1) @buffer_load_p1(ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_load_p1 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) @@ -66,15 +66,15 @@ define ptr addrspace(1) @buffer_load_p1(ptr addrspace(8) inreg %buf) { define void @buffer_store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_store_p1 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) @@ -86,12 +86,12 @@ define void @buffer_store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf define ptr addrspace(4) @buffer_load_p4(ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_load_p4 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) @@ -107,15 +107,15 @@ define ptr addrspace(4) @buffer_load_p4(ptr addrspace(8) inreg %buf) { define void @buffer_store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_store_p4 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) @@ -127,12 +127,12 @@ define void @buffer_store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf define ptr addrspace(5) @buffer_load_p5(ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_load_p5 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.buf, align 1, addrspace 8) @@ -145,13 +145,13 @@ define ptr addrspace(5) @buffer_load_p5(ptr addrspace(8) inreg %buf) { define void @buffer_store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_store_p5 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.buf, align 1, addrspace 8) @@ -163,12 +163,12 @@ define void @buffer_store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf define <2 x ptr addrspace(1)> @buffer_load_v2p1(ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_load_v2p1 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s64>) from %ir.buf, align 1, addrspace 8) @@ -188,7 +188,7 @@ define <2 x ptr addrspace(1)> @buffer_load_v2p1(ptr addrspace(8) inreg %buf) { define void @buffer_store_v2p5(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_store_v2p5 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 @@ -197,10 +197,10 @@ define void @buffer_store_v2p5(<2 x ptr addrspace(1)> %data, ptr addrspace(8) in ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX9-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s64>) into %ir.buf, align 1, addrspace 8) @@ -212,12 +212,12 @@ define void @buffer_store_v2p5(<2 x ptr addrspace(1)> %data, ptr addrspace(8) in define <3 x ptr addrspace(5)> @buffer_load_v3p5(ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_load_v3p5 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.buf, align 1, addrspace 8) @@ -235,16 +235,16 @@ define <3 x ptr addrspace(5)> @buffer_load_v3p5(ptr addrspace(8) inreg %buf) { define void @buffer_store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_store_v3p5 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; GFX9-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.buf, align 1, addrspace 8) @@ -256,12 +256,12 @@ define void @buffer_store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) in define <4 x ptr addrspace(5)> @buffer_load_v4p5(ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_load_v4p5 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.buf, align 1, addrspace 8) @@ -281,17 +281,17 @@ define <4 x ptr addrspace(5)> @buffer_load_v4p5(ptr addrspace(8) inreg %buf) { define void @buffer_store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) { ; GFX9-LABEL: name: buffer_store_v4p5 ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9-NEXT: liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr18 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr19 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; GFX9-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.buf, align 1, addrspace 8) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index e4c609c9331086..974ce492daea8b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -12,9 +12,9 @@ declare hidden void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_mov_b32 s32, 0 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 @@ -34,8 +34,8 @@ define amdgpu_kernel void @kernel_caller_stack() { ; FLATSCR-LABEL: kernel_caller_stack: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 @@ -60,9 +60,9 @@ define amdgpu_kernel void @kernel_caller_stack() { define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -155,9 +155,9 @@ define amdgpu_kernel void @kernel_caller_byval() { ; ; FLATSCR-LABEL: kernel_caller_byval: ; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll index 84378bcb706846..515b9f8955d59d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @stack_write_fi() { ; CHECK-LABEL: stack_write_fi: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_mov_b32 s6, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 405b1e8f3a250f..168bf16ad68674 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,7 +452,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -468,7 +468,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -513,7 +513,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -539,7 +539,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -589,7 +589,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -644,7 +644,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -679,7 +679,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -725,17 +725,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: s_mov_b64 s[0:1], s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -754,7 +754,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v9 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 @@ -763,13 +763,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 8 @@ -779,7 +779,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -821,7 +821,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -858,7 +858,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -949,7 +949,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -986,7 +986,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1005,7 +1005,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1221,7 +1221,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1337,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index f52b7c635a66f1..e7ddfda2875dba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -193,7 +193,7 @@ bb12: define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-LABEL: break_loop: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll index 56bd7ddde6f527..aae999ec0a99a5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll @@ -7,42 +7,43 @@ declare void @callee() define amdgpu_kernel void @call_debug_loc() { ; CHECK-LABEL: name: call_debug_loc ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2, debug-location !7 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1, debug-location !7 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0, debug-location !7 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !7 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13, debug-location !7 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12, debug-location !7 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9, debug-location !7 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16, debug-location !7 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15, debug-location !7 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !7 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11, debug-location !7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7, debug-location !7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, debug-location !7 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !7 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF debug-location !7 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !7 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !7 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !7 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !7 - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !7 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY8]], debug-location !7 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !7 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !7 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !7 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !7 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !7 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !7 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10, debug-location !7 - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], debug-location !7 - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY1]], implicit $exec, debug-location !7 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], debug-location !7 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[COPY1]], implicit $exec, debug-location !7 ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 20, debug-location !7 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]], debug-location !7 - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[COPY]], implicit $exec, debug-location !7 + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]], debug-location !7 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[COPY]], implicit $exec, debug-location !7 ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY2]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec, debug-location !7 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !7 - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]], debug-location !7 - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]], debug-location !7 - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]], debug-location !7 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY8]], debug-location !7 - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]], debug-location !7 - ; CHECK-NEXT: $sgpr12 = COPY [[COPY11]], debug-location !7 - ; CHECK-NEXT: $sgpr13 = COPY [[COPY12]], debug-location !7 - ; CHECK-NEXT: $sgpr14 = COPY [[COPY13]], debug-location !7 - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]], debug-location !7 + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !7 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]], debug-location !7 + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]], debug-location !7 + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]], debug-location !7 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY9]], debug-location !7 + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]], debug-location !7 + ; CHECK-NEXT: $sgpr12 = COPY [[COPY13]], debug-location !7 + ; CHECK-NEXT: $sgpr13 = COPY [[COPY14]], debug-location !7 + ; CHECK-NEXT: $sgpr14 = COPY [[COPY15]], debug-location !7 + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]], debug-location !7 ; CHECK-NEXT: $vgpr31 = COPY [[V_OR3_B32_e64_]], debug-location !7 ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee, target-flags(amdgpu-gotprel32-hi) @callee, implicit-def $scc, debug-location !7 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0, debug-location !7 :: (dereferenceable invariant load (p0) from got, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index 48986ea9ef9825..741323a201d02e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -25,8 +25,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s17 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -42,7 +42,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -143,8 +143,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -160,8 +160,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s17 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -177,7 +177,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -278,8 +278,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -296,8 +296,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s17 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -314,7 +314,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s33, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index ca6e5df43a0434..88a7ba7ac98928 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3027,7 +3027,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 12 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 13 +; GPRIDX-NEXT: wavefront_sgpr_count = 15 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3075,20 +3075,20 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GPRIDX-NEXT: s_load_dword s8, s[6:7], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GPRIDX-NEXT: s_load_dword s10, s[8:9], 0x8 ; GPRIDX-NEXT: s_mov_b32 s4, 0 ; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40140000 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 1 ; GPRIDX-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 2 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 3 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 4 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 4 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 @@ -3118,7 +3118,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 12 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3150,7 +3150,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 9 +; MOVREL-NEXT: wavefront_sgpr_count = 10 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -3166,8 +3166,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; MOVREL-NEXT: s_load_dword s8, s[8:9], 0x8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 ; MOVREL-NEXT: s_mov_b32 s2, 0 @@ -3210,7 +3210,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 12 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3242,7 +3242,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 10 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -3259,21 +3259,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 0x40140000 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] -; GFX10-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10-NEXT: s_cmp_eq_u32 s6, 4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -3351,8 +3351,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -4042,7 +4042,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 12 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4074,7 +4074,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 12 +; GPRIDX-NEXT: wavefront_sgpr_count = 14 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4090,8 +4090,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s2, s[6:7], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GPRIDX-NEXT: s_load_dword s2, s[8:9], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s2, 1 @@ -4115,7 +4115,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 0 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4126,7 +4126,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 12 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4158,7 +4158,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 8 +; MOVREL-NEXT: wavefront_sgpr_count = 10 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4174,8 +4174,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s2, s[6:7], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_load_dword s2, s[8:9], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4200,7 +4200,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4211,7 +4211,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 12 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4243,7 +4243,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 8 +; GFX10-NEXT: wavefront_sgpr_count = 10 ; GFX10-NEXT: workitem_vgpr_count = 2 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4260,8 +4260,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 @@ -4328,7 +4328,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: gds_segment_byte_size = 0 ; GFX11-NEXT: kernarg_segment_byte_size = 28 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 5 +; GFX11-NEXT: wavefront_sgpr_count = 6 ; GFX11-NEXT: workitem_vgpr_count = 2 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 @@ -4345,16 +4345,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 1 -; GFX11-NEXT: s_cselect_b32 s2, 2.0, 1.0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 2 -; GFX11-NEXT: s_cselect_b32 s2, 0x40400000, s2 -; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: s_cselect_b32 s2, 4.0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s3, 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 2 +; GFX11-NEXT: s_cselect_b32 s3, 0x40400000, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, 3 +; GFX11-NEXT: s_cselect_b32 s2, 4.0, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm @@ -4387,7 +4387,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 12 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4419,7 +4419,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 13 +; GPRIDX-NEXT: wavefront_sgpr_count = 14 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4435,17 +4435,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s8, s[6:7], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GPRIDX-NEXT: s_load_dword s6, s[8:9], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 12 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4506,7 +4506,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 9 +; MOVREL-NEXT: wavefront_sgpr_count = 10 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4522,16 +4522,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_load_dword s6, s[8:9], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 @@ -4562,7 +4562,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 12 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4594,7 +4594,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 10 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4611,17 +4611,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -4699,8 +4699,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 097640312322d0..b0f2aac9a42d45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -14,9 +14,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -30,11 +30,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -48,7 +48,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -62,7 +62,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -76,7 +76,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 @@ -90,9 +90,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; UNALIGNED_GFX9-LABEL: store_load_sindex_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15 ; UNALIGNED_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED_GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -106,11 +106,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; UNALIGNED_GFX10-LABEL: store_load_sindex_kernel: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 15 ; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -124,7 +124,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; UNALIGNED_GFX940-LABEL: store_load_sindex_kernel: ; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 ; UNALIGNED_GFX940-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED_GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -138,7 +138,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; UNALIGNED_GFX11-LABEL: store_load_sindex_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb -; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v0, 15 ; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED_GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -152,7 +152,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; UNALIGNED_GFX12-LABEL: store_load_sindex_kernel: ; UNALIGNED_GFX12: ; %bb.0: ; %bb -; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v0, 15 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_and_b32 s1, s0, 15 @@ -176,8 +176,8 @@ bb: define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 @@ -190,10 +190,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -246,8 +246,8 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; UNALIGNED_GFX9-LABEL: store_load_vindex_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 @@ -260,10 +260,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; UNALIGNED_GFX10-LABEL: store_load_vindex_kernel: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; UNALIGNED_GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -577,9 +577,9 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -597,11 +597,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -619,7 +619,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -637,7 +637,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -655,7 +655,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -673,9 +673,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX9-LABEL: store_load_sindex_small_offset_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 0 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -693,11 +693,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX10-LABEL: store_load_sindex_small_offset_kernel: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -715,7 +715,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX940-LABEL: store_load_sindex_small_offset_kernel: ; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -733,7 +733,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX11-LABEL: store_load_sindex_small_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb -; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -751,7 +751,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX12-LABEL: store_load_sindex_small_offset_kernel: ; UNALIGNED_GFX12: ; %bb.0: ; %bb -; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -782,8 +782,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -801,10 +801,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -869,8 +869,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; UNALIGNED_GFX9-LABEL: store_load_vindex_small_offset_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) @@ -888,10 +888,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; UNALIGNED_GFX10-LABEL: store_load_vindex_small_offset_kernel: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; UNALIGNED_GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -1166,9 +1166,9 @@ bb: define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1186,11 +1186,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -1244,7 +1244,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -1262,9 +1262,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX9-LABEL: store_load_sindex_large_offset_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 0 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1282,11 +1282,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX10-LABEL: store_load_sindex_large_offset_kernel: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -1304,7 +1304,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX940-LABEL: store_load_sindex_large_offset_kernel: ; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX11-LABEL: store_load_sindex_large_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb -; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -1340,7 +1340,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; UNALIGNED_GFX12-LABEL: store_load_sindex_large_offset_kernel: ; UNALIGNED_GFX12: ; %bb.0: ; %bb -; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -1371,8 +1371,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1390,10 +1390,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -1460,8 +1460,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; UNALIGNED_GFX9-LABEL: store_load_vindex_large_offset_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1479,10 +1479,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; UNALIGNED_GFX10-LABEL: store_load_vindex_large_offset_kernel: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; UNALIGNED_GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -1763,8 +1763,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -1780,10 +1780,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80 @@ -1837,8 +1837,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13 ; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -1854,10 +1854,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; UNALIGNED_GFX10-LABEL: store_load_large_imm_offset_kernel: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80 @@ -2095,9 +2095,9 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -2109,11 +2109,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: store_load_vidx_sidx_offset: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -2125,7 +2125,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -2138,7 +2138,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2151,7 +2151,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2164,9 +2164,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; UNALIGNED_GFX9-LABEL: store_load_vidx_sidx_offset: ; UNALIGNED_GFX9: ; %bb.0: ; %bb -; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED_GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -2178,11 +2178,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; UNALIGNED_GFX10-LABEL: store_load_vidx_sidx_offset: ; UNALIGNED_GFX10: ; %bb.0: ; %bb -; UNALIGNED_GFX10-NEXT: s_add_u32 s6, s6, s11 -; UNALIGNED_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX10-NEXT: s_add_u32 s8, s8, s13 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -2194,7 +2194,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; UNALIGNED_GFX940-LABEL: store_load_vidx_sidx_offset: ; UNALIGNED_GFX940: ; %bb.0: ; %bb -; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[2:3], 0x0 +; UNALIGNED_GFX940-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -2207,7 +2207,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; UNALIGNED_GFX11-LABEL: store_load_vidx_sidx_offset: ; UNALIGNED_GFX11: ; %bb.0: ; %bb -; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2220,7 +2220,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; UNALIGNED_GFX12-LABEL: store_load_vidx_sidx_offset: ; UNALIGNED_GFX12: ; %bb.0: ; %bb -; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0 +; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2455,13 +2455,13 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v0, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr3 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 ; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v3, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2555,13 +2555,13 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr3 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 ; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2796,17 +2796,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 ; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2944,17 +2944,17 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 ; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3282,6 +3282,10 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v15, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr16 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr15 @@ -3292,10 +3296,6 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr14 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 -; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr16 ; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 ; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v0, v6, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3475,6 +3475,10 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr15 @@ -3485,10 +3489,6 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr14 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 -; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 ; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 ; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll index b7150a224db89d..dc4545bd82ae2d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll @@ -5,9 +5,9 @@ define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg ; CHECK-LABEL: test_fmamix_constant_bus_violation_sss: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s6, s16, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s6, s18, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v0, s5 ; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1] @@ -32,8 +32,8 @@ define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg ; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_fma_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -57,8 +57,8 @@ define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val. ; CHECK-LABEL: test_fmamix_constant_bus_violation_svs: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_fma_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -82,8 +82,8 @@ define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val. ; CHECK-LABEL: test_fmamix_constant_bus_violation_vss: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_fma_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 25c7fc6463c33d..676298670f1fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 605c8f7e369194..8294dffc09b3cc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -18,24 +18,24 @@ declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -65,14 +65,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -80,14 +80,14 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -101,24 +101,24 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -148,14 +148,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -163,14 +163,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -184,24 +184,24 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -231,14 +231,14 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -246,14 +246,14 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -267,24 +267,24 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -314,14 +314,14 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -329,14 +329,14 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -350,24 +350,24 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -397,14 +397,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -412,14 +412,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -433,24 +433,24 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -480,14 +480,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -495,14 +495,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -516,24 +516,24 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -563,14 +563,14 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -578,14 +578,14 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -599,24 +599,24 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -646,14 +646,14 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -661,14 +661,14 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -682,24 +682,24 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -729,14 +729,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -744,14 +744,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -765,24 +765,24 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -812,14 +812,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -827,14 +827,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -848,24 +848,24 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -895,14 +895,14 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -910,14 +910,14 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -931,24 +931,24 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -978,14 +978,14 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -993,14 +993,14 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -1015,21 +1015,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB36_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1039,21 +1039,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB36_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB36_2: @@ -1067,20 +1067,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB37_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB37_2: @@ -1089,21 +1089,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB37_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB37_2: @@ -1117,21 +1117,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB38_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1141,21 +1141,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB38_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB38_2: @@ -1169,20 +1169,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB39_2: @@ -1191,21 +1191,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB39_2: @@ -1298,20 +1298,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB43_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB43_2: @@ -1320,21 +1320,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB43_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB43_2: @@ -1347,7 +1347,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1378,7 +1378,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1390,7 +1390,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1407,7 +1407,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1421,7 +1421,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1519,7 +1519,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1531,7 +1531,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1549,14 +1549,14 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB51_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 @@ -1570,14 +1570,14 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB51_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 @@ -1596,14 +1596,14 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB52_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 @@ -1617,14 +1617,14 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB52_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 @@ -1643,14 +1643,14 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 +; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB53_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 @@ -1664,14 +1664,14 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB53_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 8409e9c88aadaa..e4e6c44b051c32 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -5,15 +5,15 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[0:1], 0x2 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -25,27 +25,27 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x8 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: v_mul_f32_e32 v4, v0, v3 ; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 @@ -58,8 +58,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 ; VI-NEXT: v_trunc_f16_e32 v0, v0 ; VI-NEXT: v_fma_f16 v2, -v0, v1, s2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 @@ -73,39 +73,39 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[0:1], 0x2 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_rcp_f32_e32 v2, v1 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x8 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f16_e32 v0, s0 +; VI-NEXT: v_rcp_f16_e32 v0, s3 ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 ; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v2, -v0, s0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_fma_f16 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 @@ -119,39 +119,39 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[0:1], 0x2 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_rcp_f32_e32 v2, v1 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x8 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f16_e32 v0, s0 +; VI-NEXT: v_rcp_f16_e32 v0, s3 ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 ; VI-NEXT: v_trunc_f16_e32 v0, v0 -; VI-NEXT: v_fma_f16 v2, -v0, s0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_fma_f16 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 @@ -165,15 +165,15 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[0:1], 0x4 +; CI-NEXT: s_load_dword s6, s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 -; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6 +; CI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -184,25 +184,25 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s6 ; CI-NEXT: v_trunc_f32_e32 v1, v1 -; CI-NEXT: v_fma_f32 v0, -v1, v0, s2 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: v_fma_f32 v0, -v1, v0, s6 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x10 +; VI-NEXT: s_load_dword s6, s[2:3], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 -; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6 +; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -213,11 +213,11 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 +; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6 ; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v2, -v1, v0, s2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_fma_f32 v2, -v1, v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 @@ -231,37 +231,37 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[0:1], 0x4 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 -; CI-NEXT: v_rcp_f32_e32 v0, s0 +; CI-NEXT: v_rcp_f32_e32 v0, s3 ; CI-NEXT: v_mul_f32_e32 v0, s2, v0 ; CI-NEXT: v_trunc_f32_e32 v0, v0 -; CI-NEXT: v_fma_f32 v0, -v0, s0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: v_fma_f32 v0, -v0, s3, v1 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x10 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f32_e32 v0, s0 +; VI-NEXT: v_rcp_f32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-NEXT: v_trunc_f32_e32 v0, v0 -; VI-NEXT: v_fma_f32 v2, -v0, s0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_fma_f32 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 @@ -275,37 +275,37 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[0:1], 0x4 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 -; CI-NEXT: v_rcp_f32_e32 v0, s0 +; CI-NEXT: v_rcp_f32_e32 v0, s3 ; CI-NEXT: v_mul_f32_e32 v0, s2, v0 ; CI-NEXT: v_trunc_f32_e32 v0, v0 -; CI-NEXT: v_fma_f32 v0, -v0, s0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: v_fma_f32 v0, -v0, s3, v1 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x10 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_rcp_f32_e32 v0, s0 +; VI-NEXT: v_rcp_f32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-NEXT: v_trunc_f32_e32 v0, v0 -; VI-NEXT: v_fma_f32 v2, -v0, s0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_fma_f32 v2, -v0, s3, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 @@ -319,17 +319,15 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3] +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3] ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -342,20 +340,22 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3] ; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3] -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3] ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -368,8 +368,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3] ; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3] -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %r0 = load double, ptr addrspace(1) %in1, align 8 @@ -382,51 +382,51 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] +; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; CI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3] +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3] -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3] +; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3] -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %r0 = load double, ptr addrspace(1) %in1, align 8 @@ -439,51 +439,51 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] +; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; CI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3] +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3] -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5] +; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 +; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1] -; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3] +; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] -; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3] -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { @@ -497,17 +497,17 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[6:7], 0x0 -; CI-NEXT: s_load_dword s0, s[0:1], 0x4 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_lshr_b32 s6, s0, 16 -; CI-NEXT: s_lshr_b32 s3, s2, 16 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: s_lshr_b32 s5, s3, 16 +; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -522,11 +522,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v1 +; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -538,30 +538,30 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x10 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: v_mul_f32_e32 v4, v0, v3 ; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 @@ -571,12 +571,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2 ; VI-NEXT: v_trunc_f16_e32 v0, v0 ; VI-NEXT: v_fma_f16 v0, -v0, v1, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; VI-NEXT: v_rcp_f32_e32 v4, v3 ; VI-NEXT: v_mul_f32_e32 v5, v1, v4 ; VI-NEXT: v_mad_f32 v6, -v3, v5, v1 @@ -586,13 +586,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; VI-NEXT: v_add_f32_e32 v1, v1, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1 +; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s4 ; VI-NEXT: v_trunc_f16_e32 v1, v1 -; VI-NEXT: v_fma_f16 v1, -v1, v2, s1 +; VI-NEXT: v_fma_f16 v1, -v1, v2, s4 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 @@ -606,19 +606,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; CI-NEXT: s_lshr_b32 s8, s2, 16 ; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_lshr_b32 s10, s0, 16 +; CI-NEXT: s_lshr_b32 s10, s4, 16 ; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0 -; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: s_lshr_b32 s11, s5, 16 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -653,9 +653,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2 +; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, v2 ; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2 ; CI-NEXT: v_rcp_f32_e32 v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -673,7 +673,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e32 v3, s9 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s11 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[0:1], v4, v4, v3 +; CI-NEXT: v_div_scale_f32 v5, s[2:3], v4, v4, v3 ; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3 ; CI-NEXT: v_rcp_f32_e32 v7, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -687,32 +687,32 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v3, -v5, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; VI-NEXT: s_lshr_b32 s8, s0, 16 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s8, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s6, s2, 16 ; VI-NEXT: v_rcp_f32_e32 v3, v2 -; VI-NEXT: s_lshr_b32 s9, s1, 16 +; VI-NEXT: s_lshr_b32 s9, s5, 16 ; VI-NEXT: s_lshr_b32 s7, s3, 16 ; VI-NEXT: v_mul_f32_e32 v4, v0, v3 ; VI-NEXT: v_mad_f32 v5, -v2, v4, v0 @@ -737,8 +737,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; VI-NEXT: v_add_f32_e32 v1, v1, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6 ; VI-NEXT: v_trunc_f16_e32 v1, v1 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s6 @@ -774,8 +774,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_fma_f16 v3, -v3, v4, s7 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 @@ -789,13 +789,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; CI-NEXT: v_rcp_f32_e32 v3, v1 @@ -811,8 +811,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 ; CI-NEXT: v_trunc_f32_e32 v1, v1 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3 ; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -824,23 +824,23 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s3 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v1, -v2, v1, s3 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; VI-NEXT: v_rcp_f32_e32 v3, v1 @@ -856,8 +856,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2 ; VI-NEXT: v_trunc_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3 ; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 ; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -872,8 +872,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 @@ -887,15 +887,15 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0 -; CI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 +; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4 +; CI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -906,12 +906,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s0 +; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s4 ; CI-NEXT: v_trunc_f32_e32 v1, v1 -; CI-NEXT: v_fma_f32 v0, -v1, v0, s0 +; CI-NEXT: v_fma_f32 v0, -v1, v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1 -; CI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1 +; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5 +; CI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -922,12 +922,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s1 +; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s5 ; CI-NEXT: v_trunc_f32_e32 v2, v2 -; CI-NEXT: v_fma_f32 v1, -v2, v1, s1 +; CI-NEXT: v_fma_f32 v1, -v2, v1, s5 ; CI-NEXT: v_mov_b32_e32 v2, s10 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2 -; CI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2 +; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 +; CI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 @@ -938,12 +938,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s2 +; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 ; CI-NEXT: v_trunc_f32_e32 v3, v3 -; CI-NEXT: v_fma_f32 v2, -v3, v2, s2 +; CI-NEXT: v_fma_f32 v2, -v3, v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3 -; CI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3 +; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 +; CI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 ; CI-NEXT: v_rcp_f32_e32 v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 @@ -954,25 +954,25 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_fma_f32 v4, -v4, v7, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s3 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 ; CI-NEXT: v_trunc_f32_e32 v4, v4 -; CI-NEXT: v_fma_f32 v3, -v4, v3, s3 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: v_fma_f32 v3, -v4, v3, s7 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0 -; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 +; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4 +; VI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -983,12 +983,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s0 +; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s4 ; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v0, -v1, v0, s0 +; VI-NEXT: v_fma_f32 v0, -v1, v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1 -; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1 +; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5 +; VI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5 ; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -999,12 +999,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_fma_f32 v2, -v2, v5, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s1 +; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s5 ; VI-NEXT: v_trunc_f32_e32 v2, v2 -; VI-NEXT: v_fma_f32 v1, -v2, v1, s1 +; VI-NEXT: v_fma_f32 v1, -v2, v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2 -; VI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2 +; VI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 +; VI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 ; VI-NEXT: v_rcp_f32_e32 v5, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 @@ -1015,12 +1015,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_fma_f32 v3, -v3, v6, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s2 +; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 ; VI-NEXT: v_trunc_f32_e32 v3, v3 -; VI-NEXT: v_fma_f32 v2, -v3, v2, s2 +; VI-NEXT: v_fma_f32 v2, -v3, v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3 -; VI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3 +; VI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 +; VI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 ; VI-NEXT: v_rcp_f32_e32 v6, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 @@ -1031,11 +1031,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_fma_f32 v4, -v4, v7, v5 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s3 +; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 ; VI-NEXT: v_trunc_f32_e32 v4, v4 -; VI-NEXT: v_fma_f32 v3, -v4, v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_fma_f32 v3, -v4, v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 @@ -1049,18 +1049,16 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] -; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] +; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -1069,13 +1067,15 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] ; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] ; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] +; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] ; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] ; CI-NEXT: v_mov_b32_e32 v2, s10 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3] -; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3] +; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] +; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1084,24 +1084,24 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] ; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3] +; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3] -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] -; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] +; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] +; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -1110,13 +1110,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] ; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] ; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] +; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] ; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3] -; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3] +; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] +; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1125,11 +1125,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] ; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] ; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3] +; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] ; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll index fe2e7afb7048ed..1f4330afb58d24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll @@ -952,9 +952,9 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: %12:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) + ; CHECK-NEXT: %13:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) ; CHECK-NEXT: G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.arg0, addrspace 5) - ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %12(p5) :: (store (s32) into %ir.gep1, addrspace 5) + ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %13(p5) :: (store (s32) into %ir.gep1, addrspace 5) ; CHECK-NEXT: SI_RETURN %val0 = load volatile i8, ptr addrspace(1) undef %val1 = load volatile i32, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 9443b39dcdc033..54cb0777e9b2b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -35,8 +35,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 @@ -83,7 +83,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 @@ -127,9 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc +; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -183,9 +183,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -208,7 +208,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -231,7 +231,7 @@ define amdgpu_kernel void @llvm_trap() { ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; @@ -295,9 +295,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_add_u32 s0, s6, 8 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc -; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 +; GFX8V5-NEXT: s_addc_u32 s1, s9, 0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 @@ -306,10 +308,10 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) -; GFX8V5-NEXT: v_mov_b32_e32 v0, s8 -; GFX8V5-NEXT: v_mov_b32_e32 v1, s9 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s10 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s11 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v3, s1 ; GFX8V5-NEXT: v_mov_b32_e32 v2, s0 @@ -337,13 +339,14 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX9V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 -; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc ; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: v_mov_b32_e32 v0, s8 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s9 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V5-NEXT: ; kill: killed $sgpr6_sgpr7 ; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll index 69567b34ae6e60..0b0c7b7df25703 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll @@ -22,9 +22,9 @@ define amdgpu_kernel void @return_type_is_too_big_vector() { ; CHECK-LABEL: name: return_type_is_too_big_vector ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (%ir-block.0): %sgpr = call <4 x i32> asm sideeffect "; def $0", "={s[8:12]}" () @@ -37,9 +37,9 @@ define amdgpu_kernel void @return_type_is_too_small_vector() { ; CHECK-LABEL: name: return_type_is_too_small_vector ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (%ir-block.0): %sgpr = call <4 x i32> asm sideeffect "; def $0", "={s[8:10]}" () diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 4fcde0f2fc7cf1..c3b48b5d2ddff5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr addrspace(1) %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[20:23], s[6:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s13 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 8ff1e1d8d072d6..2971049c2a54a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0xf @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX11-LABEL: v_insert_v64i32_37: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll index 652d22ac1224fc..4c35d7bcbae2da 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -5,9 +5,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; HSA-VI-LABEL: name: i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -20,9 +20,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; LEGACY-MESA-VI-LABEL: name: i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -40,9 +40,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i8_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -55,9 +55,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; LEGACY-MESA-VI-LABEL: name: i8_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -75,9 +75,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; HSA-VI-LABEL: name: i8_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -90,9 +90,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; LEGACY-MESA-VI-LABEL: name: i8_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -110,9 +110,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; HSA-VI-LABEL: name: i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -125,9 +125,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -145,9 +145,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i16_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -160,9 +160,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; LEGACY-MESA-VI-LABEL: name: i16_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -180,9 +180,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { ; HSA-VI-LABEL: name: i16_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -195,9 +195,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; LEGACY-MESA-VI-LABEL: name: i16_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -215,9 +215,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; HSA-VI-LABEL: name: i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -229,9 +229,9 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -248,9 +248,9 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; HSA-VI-LABEL: name: f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -262,9 +262,9 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; LEGACY-MESA-VI-LABEL: name: f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -281,9 +281,9 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; HSA-VI-LABEL: name: v2i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -295,9 +295,9 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -314,9 +314,9 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; HSA-VI-LABEL: name: v2i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -328,9 +328,9 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -347,9 +347,9 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v2i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -361,9 +361,9 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v2i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -380,9 +380,9 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; HSA-VI-LABEL: name: v2f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -394,9 +394,9 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; LEGACY-MESA-VI-LABEL: name: v2f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -413,9 +413,9 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; HSA-VI-LABEL: name: v3i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -427,9 +427,9 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; LEGACY-MESA-VI-LABEL: name: v3i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -446,9 +446,9 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; HSA-VI-LABEL: name: v3i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -460,9 +460,9 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; LEGACY-MESA-VI-LABEL: name: v3i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v3i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -493,9 +493,9 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v3i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -512,9 +512,9 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; HSA-VI-LABEL: name: v3f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -526,9 +526,9 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; LEGACY-MESA-VI-LABEL: name: v3f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -545,9 +545,9 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; HSA-VI-LABEL: name: v4i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -559,9 +559,9 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -578,9 +578,9 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; HSA-VI-LABEL: name: v4i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -592,9 +592,9 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -611,9 +611,9 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v4i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -625,9 +625,9 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v4i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -644,9 +644,9 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; HSA-VI-LABEL: name: v4f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -658,9 +658,9 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; LEGACY-MESA-VI-LABEL: name: v4f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -677,9 +677,9 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; HSA-VI-LABEL: name: v8i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -691,9 +691,9 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -710,9 +710,9 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; HSA-VI-LABEL: name: v8i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -724,9 +724,9 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -743,9 +743,9 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v8i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -757,9 +757,9 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v8i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -776,9 +776,9 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; HSA-VI-LABEL: name: v8f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -790,9 +790,9 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; LEGACY-MESA-VI-LABEL: name: v8f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -809,9 +809,9 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; HSA-VI-LABEL: name: v16i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -823,9 +823,9 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -842,9 +842,9 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; HSA-VI-LABEL: name: v16i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -856,9 +856,9 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -875,9 +875,9 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v16i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -889,9 +889,9 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; LEGACY-MESA-VI-LABEL: name: v16i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -908,9 +908,9 @@ entry: define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; HSA-VI-LABEL: name: v16f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -922,9 +922,9 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; LEGACY-MESA-VI-LABEL: name: v16f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -941,9 +941,9 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; HSA-VI-LABEL: name: kernel_arg_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -955,9 +955,9 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; LEGACY-MESA-VI-LABEL: name: kernel_arg_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -973,9 +973,9 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; HSA-VI-LABEL: name: f64_kernel_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -987,9 +987,9 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; LEGACY-MESA-VI-LABEL: name: f64_kernel_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1006,9 +1006,9 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1020,9 +1020,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: i1_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1038,9 +1038,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1053,9 +1053,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1073,9 +1073,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1088,9 +1088,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1108,9 +1108,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1123,9 +1123,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1143,9 +1143,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1158,9 +1158,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1180,9 +1180,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1192,9 +1192,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_array_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1220,9 +1220,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_array_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1244,9 +1244,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, {i32, i64} %arg1) { ; HSA-VI-LABEL: name: struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1272,9 +1272,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, ; ; LEGACY-MESA-VI-LABEL: name: struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1312,9 +1312,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr addrspace(1)} %arg0, i8 %pad, {ptr addrspace(3), ptr addrspace(1234)} %arg1) { ; HSA-VI-LABEL: name: pointer_in_struct_argument ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1340,9 +1340,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add ; ; LEGACY-MESA-VI-LABEL: name: pointer_in_struct_argument ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1382,9 +1382,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; HSA-VI-LABEL: name: packed_struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1406,9 +1406,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; LEGACY-MESA-VI-LABEL: name: packed_struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1441,16 +1441,16 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @unused_i32_arg(ptr addrspace(1) nocapture %out, i32 %unused, i32 %in) nounwind { ; HSA-VI-LABEL: name: unused_i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: S_ENDPGM 0 ; ; LEGACY-MESA-VI-LABEL: name: unused_i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: S_ENDPGM 0 entry: ret void @@ -1460,9 +1460,9 @@ entry: define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1475,9 +1475,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1496,9 +1496,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) align 2 %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1511,9 +1511,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1532,9 +1532,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align 4 %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1550,9 +1550,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1574,9 +1574,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) align(16) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_v4i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1592,9 +1592,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1616,9 +1616,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_align_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1634,9 +1634,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1658,9 +1658,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1676,9 +1676,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1701,9 +1701,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_global_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1716,9 +1716,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1736,9 +1736,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_flat_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1751,9 +1751,9 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p ; ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1771,9 +1771,9 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_32bit_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1786,9 +1786,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1806,9 +1806,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_unknown_as_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1821,9 +1821,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1842,9 +1842,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(3) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_local_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1857,9 +1857,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1877,9 +1877,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(4) %in0.byref, ptr addrspace(4) byref(i32) align(4) %in1.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: multi_byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1899,9 +1899,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1929,9 +1929,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1958,9 +1958,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; HSA-VI-LABEL: name: p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), align 16, addrspace 4) @@ -1970,9 +1970,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), addrspace 4) @@ -1986,9 +1986,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; HSA-VI-LABEL: name: p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; HSA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -1996,9 +1996,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; LEGACY-MESA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -2010,9 +2010,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), addrspace 4) @@ -2022,9 +2022,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), align 4, addrspace 4) @@ -2038,9 +2038,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 16, addrspace 4) @@ -2050,9 +2050,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 4, addrspace 4) @@ -2066,9 +2066,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x ptr addrspace(3)> } %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_in_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr8_sgpr9 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), addrspace 4) @@ -2084,9 +2084,9 @@ define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_in_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll index 3150f8cac12846..fc3eb19e985240 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll @@ -32,13 +32,13 @@ define void @call_result_align_1() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -81,13 +81,13 @@ define void @call_result_align_8() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -131,13 +131,13 @@ define void @declaration_result_align_8() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @returns_ptr_align8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -181,11 +181,11 @@ define ptr addrspace(1) @tail_call_assert_align() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @returns_ptr_align8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll index d4a7f3b2d387d0..be0c9e2a602faf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll @@ -30,8 +30,8 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) { ; CHECK-NEXT: bb.2.atomicrmw.start: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %15(s64), %bb.2, [[C1]](s64), %bb.1 - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %13(s32), %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %16(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %14(s32), %bb.2 ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]] ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) @@ -80,8 +80,8 @@ define <2 x half> @test_atomicrmw_fsub_vector(ptr addrspace(3) %addr) { ; CHECK-NEXT: bb.2.atomicrmw.start: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1 - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %20(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %19(<2 x s16>), %bb.2 ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(<2 x s16>) = G_FSUB [[PHI1]], [[BUILD_VECTOR]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FSUB]](<2 x s16>) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) @@ -118,8 +118,8 @@ define <2 x half> @test_atomicrmw_fmin_vector(ptr addrspace(3) %addr) { ; CHECK-NEXT: bb.2.atomicrmw.start: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1 - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %20(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %19(<2 x s16>), %bb.2 ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM [[PHI1]], [[BUILD_VECTOR]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM]](<2 x s16>) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) @@ -156,8 +156,8 @@ define <2 x half> @test_atomicrmw_fmax_vector(ptr addrspace(3) %addr) { ; CHECK-NEXT: bb.2.atomicrmw.start: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1 - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %20(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %19(<2 x s16>), %bb.2 ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM [[PHI1]], [[BUILD_VECTOR]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM]](<2 x s16>) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll index ca33eae148819f..31c08a3479bb38 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll @@ -9,36 +9,37 @@ declare hidden void @extern() define amdgpu_kernel void @kernel_call_no_workitem_ids() { ; CHECK-LABEL: name: kernel_call_no_workitem_ids ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY4]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]](p4) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY7]], [[C]](s64) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY9]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY10]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY11]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY11]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY12]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY13]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: S_ENDPGM 0 @@ -49,39 +50,40 @@ define amdgpu_kernel void @kernel_call_no_workitem_ids() { define amdgpu_kernel void @kernel_call_no_workgroup_ids() { ; CHECK-LABEL: name: kernel_call_no_workgroup_ids ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY4]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]](p4) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY7]], [[C]](s64) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY11]], [[SHL]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @extern, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -93,12 +95,12 @@ define amdgpu_kernel void @kernel_call_no_workgroup_ids() { define amdgpu_kernel void @kernel_call_no_other_sgprs() { ; CHECK-LABEL: name: kernel_call_no_other_sgprs ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p4) = COPY [[COPY3]](p4) @@ -137,12 +139,12 @@ define void @func_call_no_workitem_ids() { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] @@ -175,12 +177,12 @@ define void @func_call_no_workgroup_ids() { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]](p4) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY3]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY2]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll index a5f59b15c11b84..6e85ccbafd5355 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll @@ -11,51 +11,52 @@ declare hidden void @external_void_func_v32i32(<32 x i32>) #0 define amdgpu_kernel void @test_call_external_void_func_i32([17 x i8]) #0 { ; GFX900-LABEL: name: test_call_external_void_func_i32 ; GFX900: bb.1 (%ir-block.1): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX900-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GFX900-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; GFX900-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -63,51 +64,52 @@ define amdgpu_kernel void @test_call_external_void_func_i32([17 x i8]) #0 { ; ; GFX908-LABEL: name: test_call_external_void_func_i32 ; GFX908: bb.1 (%ir-block.1): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GFX908-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; GFX908-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -128,13 +130,13 @@ define void @test_func_call_external_void_func_i32() #0 { ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 99 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -169,13 +171,13 @@ define void @test_func_call_external_void_func_i32() #0 { ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 99 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -207,40 +209,41 @@ define void @test_func_call_external_void_func_i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-LABEL: name: test_call_external_void_func_v32i32 ; GFX900: bb.1 (%ir-block.1): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX900-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX900-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) ; GFX900-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GFX900-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; GFX900-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX900-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>) ; GFX900-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -278,16 +281,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-NEXT: $vgpr28 = COPY [[UV28]](s32) ; GFX900-NEXT: $vgpr29 = COPY [[UV29]](s32) ; GFX900-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -295,40 +298,41 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; ; GFX908-LABEL: name: test_call_external_void_func_v32i32 ; GFX908: bb.1 (%ir-block.1): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX908-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) ; GFX908-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 20 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GFX908-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; GFX908-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX908-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<32 x s32>) ; GFX908-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -366,16 +370,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX908-NEXT: $vgpr28 = COPY [[UV28]](s32) ; GFX908-NEXT: $vgpr29 = COPY [[UV29]](s32) ; GFX908-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -396,7 +400,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; GFX900-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX900-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) @@ -454,7 +458,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 ; GFX900-NEXT: [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX900-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; GFX900-NEXT: [[COPY28:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX900-NEXT: [[COPY29:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX900-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -524,7 +528,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; GFX908-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX908-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) @@ -582,7 +586,7 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 ; GFX908-NEXT: [[COPY26:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; GFX908-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY27:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; GFX908-NEXT: [[COPY28:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; GFX908-NEXT: [[COPY29:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; GFX908-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -647,82 +651,84 @@ define void @test_func_call_external_void_func_v32i32([17 x i8]) #0 { define amdgpu_kernel void @test_only_workitem_id_x() #0 !reqd_work_group_size !0 { ; GFX900-LABEL: name: test_only_workitem_id_x ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY10]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY11]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) - ; GFX900-NEXT: $vgpr31 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX900-NEXT: $vgpr31 = COPY [[COPY15]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GFX900-NEXT: S_ENDPGM 0 ; ; GFX908-LABEL: name: test_only_workitem_id_x ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY10]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY11]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) - ; GFX908-NEXT: $vgpr31 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) + ; GFX908-NEXT: $vgpr31 = COPY [[COPY15]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GFX908-NEXT: S_ENDPGM 0 @@ -733,44 +739,45 @@ define amdgpu_kernel void @test_only_workitem_id_x() #0 !reqd_work_group_size !0 define amdgpu_kernel void @test_only_workitem_id_y() #0 !reqd_work_group_size !1 { ; GFX900-LABEL: name: test_only_workitem_id_y ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY10]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY11]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -778,44 +785,45 @@ define amdgpu_kernel void @test_only_workitem_id_y() #0 !reqd_work_group_size !1 ; ; GFX908-LABEL: name: test_only_workitem_id_y ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY10]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY11]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -827,44 +835,45 @@ define amdgpu_kernel void @test_only_workitem_id_y() #0 !reqd_work_group_size !1 define amdgpu_kernel void @test_only_workitem_id_z() #0 !reqd_work_group_size !2 { ; GFX900-LABEL: name: test_only_workitem_id_z ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 - ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY10]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY11]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -872,44 +881,45 @@ define amdgpu_kernel void @test_only_workitem_id_z() #0 !reqd_work_group_size !2 ; ; GFX908-LABEL: name: test_only_workitem_id_z ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY8]], [[C1]](s64) - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(s64) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C3]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C3]](s32) ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY9]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY9]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY10]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY11]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY12]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -921,45 +931,46 @@ define amdgpu_kernel void @test_only_workitem_id_z() #0 !reqd_work_group_size !2 define amdgpu_kernel void @test_only_workitem_id_xy() #0 !reqd_work_group_size !3 { ; GFX900-LABEL: name: test_only_workitem_id_xy ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY11]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY15]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -967,45 +978,46 @@ define amdgpu_kernel void @test_only_workitem_id_xy() #0 !reqd_work_group_size ! ; ; GFX908-LABEL: name: test_only_workitem_id_xy ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY11]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY15]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1017,49 +1029,50 @@ define amdgpu_kernel void @test_only_workitem_id_xy() #0 !reqd_work_group_size ! define amdgpu_kernel void @test_only_workitem_id_yz() #0 !reqd_work_group_size !4 { ; GFX900-LABEL: name: test_only_workitem_id_yz ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX900-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY14]], [[C3]](s32) + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C4]](s32) + ; GFX900-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) ; GFX900-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY11]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY15]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1067,49 +1080,50 @@ define amdgpu_kernel void @test_only_workitem_id_yz() #0 !reqd_work_group_size ! ; ; GFX908-LABEL: name: test_only_workitem_id_yz ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GFX908-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY14]], [[C3]](s32) + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL]] - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C4]](s32) + ; GFX908-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) ; GFX908-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY11]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY15]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1121,45 +1135,46 @@ define amdgpu_kernel void @test_only_workitem_id_yz() #0 !reqd_work_group_size ! define amdgpu_kernel void @test_only_workitem_id_xz() #0 !reqd_work_group_size !5 { ; GFX900-LABEL: name: test_only_workitem_id_xz ; GFX900: bb.1 (%ir-block.0): - ; GFX900-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX900-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX900-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX900-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX900-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX900-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX900-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX900-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX900-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX900-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX900-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX900-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX900-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX900-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX900-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) - ; GFX900-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX900-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX900-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX900-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) + ; GFX900-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX900-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX900-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX900-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX900-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX900-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX900-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) - ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] + ; GFX900-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GFX900-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] ; GFX900-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX900-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX900-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX900-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX900-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX900-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) ; GFX900-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; GFX900-NEXT: $sgpr12 = COPY [[COPY11]](s32) - ; GFX900-NEXT: $sgpr13 = COPY [[COPY12]](s32) - ; GFX900-NEXT: $sgpr14 = COPY [[COPY13]](s32) - ; GFX900-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX900-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) + ; GFX900-NEXT: $sgpr12 = COPY [[COPY13]](s32) + ; GFX900-NEXT: $sgpr13 = COPY [[COPY14]](s32) + ; GFX900-NEXT: $sgpr14 = COPY [[COPY15]](s32) + ; GFX900-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX900-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX900-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX900-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1167,45 +1182,46 @@ define amdgpu_kernel void @test_only_workitem_id_xz() #0 !reqd_work_group_size ! ; ; GFX908-LABEL: name: test_only_workitem_id_xz ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GFX908-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GFX908-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GFX908-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GFX908-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GFX908-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) ; GFX908-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C1]](s64) - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; GFX908-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX908-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY11]], [[C1]](s64) + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GFX908-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GFX908-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C2]](s32) - ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL]] + ; GFX908-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GFX908-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY16]], [[SHL]] ; GFX908-NEXT: $vgpr0 = COPY [[C]](s32) - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY16]](<4 x s32>) - ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]](p4) - ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GFX908-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) + ; GFX908-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) + ; GFX908-NEXT: $sgpr6_sgpr7 = COPY [[COPY10]](p4) ; GFX908-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64) - ; GFX908-NEXT: $sgpr12 = COPY [[COPY11]](s32) - ; GFX908-NEXT: $sgpr13 = COPY [[COPY12]](s32) - ; GFX908-NEXT: $sgpr14 = COPY [[COPY13]](s32) - ; GFX908-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GFX908-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]](s64) + ; GFX908-NEXT: $sgpr12 = COPY [[COPY13]](s32) + ; GFX908-NEXT: $sgpr13 = COPY [[COPY14]](s32) + ; GFX908-NEXT: $sgpr14 = COPY [[COPY15]](s32) + ; GFX908-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GFX908-NEXT: $vgpr31 = COPY [[OR]](s32) ; GFX908-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GFX908-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll index 097def586e61c4..c87b2ce3eba29b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll @@ -70,57 +70,58 @@ declare hidden i32 @external_gfx_i32_func_i32(i32) #0 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 { ; GCN-LABEL: name: test_call_external_i32_func_i32_imm ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; GCN-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GCN-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p1) from %ir.out.kernarg.offset1, align 16, addrspace 4) ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_func_i32 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[C]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out.load, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY21]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call i32 @external_i32_func_i32(i32 42) store volatile i32 %val, ptr addrspace(1) %out @@ -154,53 +155,54 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(ptr addrspace(1) define amdgpu_kernel void @test_call_external_i1_func_void() #0 { ; GCN-LABEL: name: test_call_external_i1_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i1_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY19]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -231,53 +233,54 @@ define amdgpu_gfx void @test_gfx_call_external_i1_func_void() #0 { define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i1_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i1_zeroext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 1 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_ZEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1) @@ -292,53 +295,54 @@ define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i1_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i1_signext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 1 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ASSERT_SEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1) @@ -353,53 +357,54 @@ define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i8_func_void() #0 { ; GCN-LABEL: name: test_call_external_i8_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i8_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i8_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) ; GCN-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1) @@ -432,53 +437,54 @@ define amdgpu_gfx void @test_gfx_call_external_i8_func_void() #0 { define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i8_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i8_zeroext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i8_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 8 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 8 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_ZEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8) @@ -493,53 +499,54 @@ define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i8_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i8_signext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i8_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 8 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 8 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[ASSERT_SEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8) @@ -554,53 +561,54 @@ define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -612,53 +620,54 @@ define amdgpu_kernel void @test_call_external_i16_func_void() #0 { define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i16_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i16_zeroext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i16_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY19]], 16 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY21]], 16 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_ZEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s16) @@ -673,53 +682,54 @@ define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i16_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i16_signext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i16_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY19]], 16 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[COPY21]], 16 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASSERT_SEXT]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16) @@ -734,54 +744,55 @@ define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call i32 @external_i32_func_void() store volatile i32 %val, ptr addrspace(1) undef @@ -809,54 +820,55 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_void() #0 { define amdgpu_kernel void @test_call_external_i48_func_void() #0 { ; GCN-LABEL: name: test_call_external_i48_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i48_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i48_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s48), [[DEF]](p1) :: (volatile store (s48) into `ptr addrspace(1) undef`, align 8, addrspace 1) @@ -869,54 +881,55 @@ define amdgpu_kernel void @test_call_external_i48_func_void() #0 { define amdgpu_kernel void @test_call_external_i48_zeroext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i48_zeroext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i48_zeroext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i48_zeroext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s48) @@ -931,54 +944,55 @@ define amdgpu_kernel void @test_call_external_i48_zeroext_func_void() #0 { define amdgpu_kernel void @test_call_external_i48_signext_func_void() #0 { ; GCN-LABEL: name: test_call_external_i48_signext_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i48_signext_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i48_signext_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s48) @@ -993,54 +1007,55 @@ define amdgpu_kernel void @test_call_external_i48_signext_func_void() #0 { define amdgpu_kernel void @test_call_external_i64_func_void() #0 { ; GCN-LABEL: name: test_call_external_i64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i64_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1052,54 +1067,55 @@ define amdgpu_kernel void @test_call_external_i64_func_void() #0 { define amdgpu_kernel void @test_call_external_p1_func_void() #0 { ; GCN-LABEL: name: test_call_external_p1_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_p1_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_p1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[MV]](p1), [[DEF]](p1) :: (volatile store (p1) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1111,57 +1127,58 @@ define amdgpu_kernel void @test_call_external_p1_func_void() #0 { define amdgpu_kernel void @test_call_external_v2p1_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2p1_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2p1_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2p1_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) - ; GCN-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (volatile store (<2 x p1>) into `ptr addrspace(1) undef`, addrspace 1) @@ -1174,54 +1191,55 @@ define amdgpu_kernel void @test_call_external_v2p1_func_void() #0 { define amdgpu_kernel void @test_call_external_p3_func_void() #0 { ; GCN-LABEL: name: test_call_external_p3_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_p3_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_p3_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](p3), [[DEF]](p3) :: (volatile store (p3) into `ptr addrspace(3) undef`, addrspace 3) + ; GCN-NEXT: G_STORE [[COPY21]](p3), [[DEF]](p3) :: (volatile store (p3) into `ptr addrspace(3) undef`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 %val = call ptr addrspace(3) @external_p3_func_void() store volatile ptr addrspace(3) %val, ptr addrspace(3) undef @@ -1231,54 +1249,55 @@ define amdgpu_kernel void @test_call_external_p3_func_void() #0 { define amdgpu_kernel void @test_call_external_v2p3_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2p3_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2p3_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2p3_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(p3) = COPY $vgpr1 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY19]](p3), [[COPY20]](p3) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(p3) = COPY $vgpr1 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY21]](p3), [[COPY22]](p3) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p3) :: (volatile store (<2 x p3>) into `ptr addrspace(3) undef`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 @@ -1290,53 +1309,54 @@ define amdgpu_kernel void @test_call_external_v2p3_func_void() #0 { define amdgpu_kernel void @test_call_external_f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_f16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store (s16) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1348,54 +1368,55 @@ define amdgpu_kernel void @test_call_external_f16_func_void() #0 { define amdgpu_kernel void @test_call_external_f32_func_void() #0 { ; GCN-LABEL: name: test_call_external_f32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_f32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_f32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call float @external_f32_func_void() store volatile float %val, ptr addrspace(1) undef @@ -1405,54 +1426,55 @@ define amdgpu_kernel void @test_call_external_f32_func_void() #0 { define amdgpu_kernel void @test_call_external_f64_func_void() #0 { ; GCN-LABEL: name: test_call_external_f64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_f64_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_f64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1464,57 +1486,58 @@ define amdgpu_kernel void @test_call_external_f64_func_void() #0 { define amdgpu_kernel void @test_call_external_v2f64_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2f64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2f64_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2f64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY19]](s32), [[COPY20]](s32) - ; GCN-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (volatile store (<2 x s64>) into `ptr addrspace(1) undef`, addrspace 1) @@ -1527,54 +1550,55 @@ define amdgpu_kernel void @test_call_external_v2f64_func_void() #0 { define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (volatile store (<2 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1586,55 +1610,56 @@ define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store (<3 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1646,56 +1671,57 @@ define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v4i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v4i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v4i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (volatile store (<4 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1707,57 +1733,58 @@ define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v5i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v5i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v5i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store (<5 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1769,60 +1796,61 @@ define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v8i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v8i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v8i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (volatile store (<8 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1834,68 +1862,69 @@ define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v16i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v16i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v16i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (volatile store (<16 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1907,84 +1936,85 @@ define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v32i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v32i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v32i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GCN-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GCN-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GCN-NEXT: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GCN-NEXT: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GCN-NEXT: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr24 - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr25 - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr26 - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr27 - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr28 - ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr29 - ; GCN-NEXT: [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; GCN-NEXT: [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32), [[COPY37]](s32), [[COPY38]](s32), [[COPY39]](s32), [[COPY40]](s32), [[COPY41]](s32), [[COPY42]](s32), [[COPY43]](s32), [[COPY44]](s32), [[COPY45]](s32), [[COPY46]](s32), [[COPY47]](s32), [[COPY48]](s32), [[COPY49]](s32), [[COPY50]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GCN-NEXT: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN-NEXT: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN-NEXT: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN-NEXT: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN-NEXT: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN-NEXT: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN-NEXT: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN-NEXT: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GCN-NEXT: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GCN-NEXT: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr24 + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr25 + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr26 + ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr27 + ; GCN-NEXT: [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr28 + ; GCN-NEXT: [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr29 + ; GCN-NEXT: [[COPY51:%[0-9]+]]:_(s32) = COPY $vgpr30 + ; GCN-NEXT: [[COPY52:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32), [[COPY37]](s32), [[COPY38]](s32), [[COPY39]](s32), [[COPY40]](s32), [[COPY41]](s32), [[COPY42]](s32), [[COPY43]](s32), [[COPY44]](s32), [[COPY45]](s32), [[COPY46]](s32), [[COPY47]](s32), [[COPY48]](s32), [[COPY49]](s32), [[COPY50]](s32), [[COPY51]](s32), [[COPY52]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `ptr addrspace(1) undef`, align 8, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -1996,54 +2026,55 @@ define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2i16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call <2 x i16> @external_v2i16_func_void() store volatile <2 x i16> %val, ptr addrspace(1) undef @@ -2053,54 +2084,55 @@ define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 { define amdgpu_kernel void @test_call_external_v3i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3i16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2114,54 +2146,55 @@ define amdgpu_kernel void @test_call_external_v3i16_func_void() #0 { define amdgpu_kernel void @test_call_external_v4i16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v4i16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v4i16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v4i16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store (<4 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2173,54 +2206,55 @@ define amdgpu_kernel void @test_call_external_v4i16_func_void() #0 { define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v2f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v2f16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v2f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call <2 x half> @external_v2f16_func_void() store volatile <2 x half> %val, ptr addrspace(1) undef @@ -2230,54 +2264,55 @@ define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 { define amdgpu_kernel void @test_call_external_v3f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3f16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2291,54 +2326,55 @@ define amdgpu_kernel void @test_call_external_v3f16_func_void() #0 { define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { ; GCN-LABEL: name: test_call_external_v4f16_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v4f16_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v4f16_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store (<4 x s16>) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2350,55 +2386,56 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v3f32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v3f32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v3f32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store (<3 x s32>) into `ptr addrspace(1) undef`, align 16, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2410,57 +2447,58 @@ define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v5f32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v5f32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v5f32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store (<5 x s32>) into `ptr addrspace(1) undef`, align 32, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -2473,57 +2511,58 @@ define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 { ; GCN-LABEL: name: test_call_external_i32_i64_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_i64_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_i64_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call { i32, i64 } @external_i32_i64_func_void() @@ -2562,56 +2601,57 @@ define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 { define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_a2i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_a2i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_a2i32_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; GCN-NEXT: G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: G_STORE [[COPY20]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: G_STORE [[COPY22]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 %val = call [2 x i32] @external_a2i32_func_void() %val.0 = extractvalue [2 x i32] %val, 0 @@ -2624,65 +2664,66 @@ define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 { define amdgpu_kernel void @test_call_external_a5i8_func_void() #0 { ; GCN-LABEL: name: test_call_external_a5i8_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_a5i8_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_a5i8_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) + ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) ; GCN-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32) + ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY22]](s32) ; GCN-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC2]](s16) - ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GCN-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY23]](s32) ; GCN-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC4]](s16) - ; GCN-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GCN-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY22]](s32) + ; GCN-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY24]](s32) ; GCN-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC6]](s16) - ; GCN-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GCN-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY23]](s32) + ; GCN-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY25]](s32) ; GCN-NEXT: [[TRUNC9:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC8]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1) @@ -2708,51 +2749,52 @@ define amdgpu_kernel void @test_call_external_a5i8_func_void() #0 { define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v32i32_i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v32i32_i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v32i32_i32_func_void, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2774,51 +2816,52 @@ define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 { define amdgpu_kernel void @test_call_external_i32_v32i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_i32_v32i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_i32_v32i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_i32_v32i32_func_void, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2840,51 +2883,52 @@ define amdgpu_kernel void @test_call_external_i32_v32i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v33i32_func_void() #0 { ; GCN-LABEL: name: test_call_external_v33i32_func_void ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v33i32_func_void - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v33i32_func_void, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2899,60 +2943,61 @@ define amdgpu_kernel void @test_call_external_v33i32_func_void() #0 { define amdgpu_kernel void @test_call_external_v33i32_func_v33i32_i32(ptr addrspace(1) %p, i32 %idx) #0 { ; GCN-LABEL: name: test_call_external_v33i32_func_v33i32_i32 ; GCN: bb.1 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GCN-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p1) from %ir.p.kernarg.offset1, align 16, addrspace 4) ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GCN-NEXT: %17:_(p4) = nuw nusw G_PTR_ADD [[INT]], [[C]](s64) - ; GCN-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD %17(p4) :: (dereferenceable invariant load (s32) from %ir.idx.kernarg.offset, align 8, addrspace 4) + ; GCN-NEXT: %18:_(p4) = nuw nusw G_PTR_ADD [[INT]], [[C]](s64) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD %18(p4) :: (dereferenceable invariant load (s32) from %ir.idx.kernarg.offset, align 8, addrspace 4) ; GCN-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_v33i32_func_v33i32_i32 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p1) ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5) ; GCN-NEXT: $vgpr1 = COPY [[UV]](s32) ; GCN-NEXT: $vgpr2 = COPY [[UV1]](s32) ; GCN-NEXT: $vgpr3 = COPY [[LOAD1]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_v33i32_func_v33i32_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll index cca35d66049cc7..6000e9c60aac4e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll @@ -6,17 +6,18 @@ declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(p define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { ; GCN-LABEL: name: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 ; GCN: bb.1 (%ir-block.1): - ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GCN-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -24,29 +25,29 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GCN-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1.out.val ; GCN-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GCN-NEXT: %17:_(p5) = nuw nusw G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; GCN-NEXT: %18:_(p5) = nuw nusw G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) ; GCN-NEXT: G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store (s8) into %ir.in.val, addrspace 5) - ; GCN-NEXT: G_STORE [[C1]](s32), %17(p5) :: (store (s32) into %ir.in.gep1, addrspace 5) + ; GCN-NEXT: G_STORE [[C1]](s32), %18(p5) :: (store (s32) into %ir.in.gep1, addrspace 5) ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; GCN-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) - ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) + ; GCN-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) + ; GCN-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) ; GCN-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; GCN-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -54,22 +55,22 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GCN-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GCN-NEXT: G_MEMCPY [[PTR_ADD1]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store (s64) into stack, align 4, addrspace 5), (dereferenceable load (s64) from %ir.in.val, align 4, addrspace 5) ; GCN-NEXT: $vgpr0 = COPY [[FRAME_INDEX1]](p5) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; GCN-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; GCN-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; GCN-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; GCN-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; GCN-NEXT: $vgpr31 = COPY [[OR1]](s32) ; GCN-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc - ; GCN-NEXT: %45:_(p5) = nuw nusw G_PTR_ADD [[FRAME_INDEX1]], [[C2]](s32) + ; GCN-NEXT: %46:_(p5) = nuw nusw G_PTR_ADD [[FRAME_INDEX1]], [[C2]](s32) ; GCN-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (dereferenceable load (s8) from %ir.out.val, addrspace 5) - ; GCN-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD %45(p5) :: (dereferenceable load (s32) from %ir.out.gep1, addrspace 5) + ; GCN-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD %46(p5) :: (dereferenceable load (s32) from %ir.out.gep1, addrspace 5) ; GCN-NEXT: G_STORE [[LOAD]](s8), [[DEF]](p1) :: (volatile store (s8) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: G_STORE [[LOAD1]](s32), [[DEF]](p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 96c3575e3190c0..7691f4c30de04a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -115,48 +115,49 @@ declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, define amdgpu_kernel void @test_call_external_void_func_void() #0 { ; CHECK-LABEL: name: test_call_external_void_func_void ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_void - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_void, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -191,12 +192,12 @@ define void @test_func_call_external_void_func_void() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_void ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -225,50 +226,51 @@ define void @test_func_call_external_void_func_void() #0 { define amdgpu_kernel void @test_call_external_void_func_empty_struct() #0 { ; CHECK-LABEL: name: test_call_external_void_func_empty_struct ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_empty_struct - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_empty_struct, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -280,50 +282,51 @@ define amdgpu_kernel void @test_call_external_void_func_empty_struct() #0 { define amdgpu_kernel void @test_call_external_void_func_empty_array() #0 { ; CHECK-LABEL: name: test_call_external_void_func_empty_array ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_empty_array - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_empty_array, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -335,51 +338,52 @@ define amdgpu_kernel void @test_call_external_void_func_empty_array() #0 { define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_i1_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -391,53 +395,54 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i1_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_signext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[SEXT]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -450,53 +455,54 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i1_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_zeroext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i1_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -509,53 +515,54 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i8_imm ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 123 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[C]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i8, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -567,54 +574,55 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i8_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8_signext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[LOAD]](s8) ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s32) = G_SEXT [[SEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[SEXT1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i8_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -627,54 +635,55 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i8_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8_zeroext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[LOAD]](s8) ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ZEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i8_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -687,51 +696,52 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_i16_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 123 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -743,53 +753,54 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i16_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_signext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[SEXT]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_signext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -802,53 +813,54 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i16_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_zeroext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i16_zeroext, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -861,51 +873,52 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i32_imm ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -956,52 +969,53 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_i64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 123 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1013,55 +1027,56 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `ptr addrspace(1) null`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1074,56 +1089,57 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 17179869187 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1135,55 +1151,56 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i48(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i48 ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s48) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i48, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1196,55 +1213,56 @@ define amdgpu_kernel void @test_call_external_void_func_i48(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i48_signext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i48_signext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_signext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s48) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i48_signext, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1257,55 +1275,56 @@ define amdgpu_kernel void @test_call_external_void_func_i48_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i48_zeroext(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i48_zeroext ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_zeroext - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s48) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_i48_zeroext, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1318,53 +1337,54 @@ define amdgpu_kernel void @test_call_external_void_func_i48_zeroext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_p0_imm(ptr %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_p0_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.arg.kernarg.offset1, align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_p0 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](p0) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_p0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1376,55 +1396,56 @@ define amdgpu_kernel void @test_call_external_void_func_p0_imm(ptr %arg) #0 { define amdgpu_kernel void @test_call_external_void_func_v2p0() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2p0 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x p0>) from `ptr addrspace(1) null`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2p0 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x p0>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2p0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1437,17 +1458,18 @@ define amdgpu_kernel void @test_call_external_void_func_v2p0() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF @@ -1456,24 +1478,24 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHUF]](<3 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1482,16 +1504,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1506,17 +1528,18 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 17179869187 @@ -1525,24 +1548,24 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 3) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHUF]](<4 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1553,16 +1576,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1576,51 +1599,52 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_f16_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH4400 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1632,50 +1656,51 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f32, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1687,54 +1712,55 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1746,56 +1772,57 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1807,17 +1834,18 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v5f32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 2.000000e+00 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 4.000000e+00 @@ -1826,24 +1854,24 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5f32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C5]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C5]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C6]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C6]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C7]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C7]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<5 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1851,16 +1879,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v5f32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1872,52 +1900,53 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_f64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.000000e+00 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_f64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1929,56 +1958,57 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2f64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -1990,41 +2020,42 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3f64_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 4.000000e+00 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_FCONSTANT double 8.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64), [[C2]](s64) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f64 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -2033,16 +2064,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f64, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2054,51 +2085,52 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2111,56 +2143,57 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<3 x s16>) - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF3]](s16) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF2]](s16) ; CHECK-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV3]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV4]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2173,56 +2206,57 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3f16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `ptr addrspace(1) undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<3 x s16>) - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF3]](s16) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[DEF2]](s16) ; CHECK-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV3]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV4]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3f16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2235,53 +2269,54 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s16>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2294,17 +2329,18 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i16_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 3 @@ -2312,38 +2348,38 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C4]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C4]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C5]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C5]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C6]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C6]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2355,57 +2391,58 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v5i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v5i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<5 x s16>) from `ptr addrspace(1) undef`, align 16, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<5 x s16>) - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[DEF3]](s16) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[DEF2]](s16) ; CHECK-NEXT: [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<6 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV5]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV6]](<2 x s16>) ; CHECK-NEXT: $vgpr2 = COPY [[UV7]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v5i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2418,58 +2455,59 @@ define amdgpu_kernel void @test_call_external_void_func_v5i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v7i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v7i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<7 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<7 x s16>) from `ptr addrspace(1) undef`, align 16, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v7i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<7 x s16>) - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF3]](s16) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF2]](s16) ; CHECK-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s16>) ; CHECK-NEXT: $vgpr0 = COPY [[UV7]](<2 x s16>) ; CHECK-NEXT: $vgpr1 = COPY [[UV8]](<2 x s16>) ; CHECK-NEXT: $vgpr2 = COPY [[UV9]](<2 x s16>) ; CHECK-NEXT: $vgpr3 = COPY [[UV10]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v7i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2482,43 +2520,44 @@ define amdgpu_kernel void @test_call_external_void_func_v7i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v63i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v63i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<63 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<63 x s16>) from `ptr addrspace(1) undef`, align 128, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v63i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16), [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16), [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<63 x s16>) - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<64 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[DEF3]](s16) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<64 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[DEF2]](s16) ; CHECK-NEXT: [[UV63:%[0-9]+]]:_(<2 x s16>), [[UV64:%[0-9]+]]:_(<2 x s16>), [[UV65:%[0-9]+]]:_(<2 x s16>), [[UV66:%[0-9]+]]:_(<2 x s16>), [[UV67:%[0-9]+]]:_(<2 x s16>), [[UV68:%[0-9]+]]:_(<2 x s16>), [[UV69:%[0-9]+]]:_(<2 x s16>), [[UV70:%[0-9]+]]:_(<2 x s16>), [[UV71:%[0-9]+]]:_(<2 x s16>), [[UV72:%[0-9]+]]:_(<2 x s16>), [[UV73:%[0-9]+]]:_(<2 x s16>), [[UV74:%[0-9]+]]:_(<2 x s16>), [[UV75:%[0-9]+]]:_(<2 x s16>), [[UV76:%[0-9]+]]:_(<2 x s16>), [[UV77:%[0-9]+]]:_(<2 x s16>), [[UV78:%[0-9]+]]:_(<2 x s16>), [[UV79:%[0-9]+]]:_(<2 x s16>), [[UV80:%[0-9]+]]:_(<2 x s16>), [[UV81:%[0-9]+]]:_(<2 x s16>), [[UV82:%[0-9]+]]:_(<2 x s16>), [[UV83:%[0-9]+]]:_(<2 x s16>), [[UV84:%[0-9]+]]:_(<2 x s16>), [[UV85:%[0-9]+]]:_(<2 x s16>), [[UV86:%[0-9]+]]:_(<2 x s16>), [[UV87:%[0-9]+]]:_(<2 x s16>), [[UV88:%[0-9]+]]:_(<2 x s16>), [[UV89:%[0-9]+]]:_(<2 x s16>), [[UV90:%[0-9]+]]:_(<2 x s16>), [[UV91:%[0-9]+]]:_(<2 x s16>), [[UV92:%[0-9]+]]:_(<2 x s16>), [[UV93:%[0-9]+]]:_(<2 x s16>), [[UV94:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<64 x s16>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -2555,16 +2594,16 @@ define amdgpu_kernel void @test_call_external_void_func_v63i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV91]](<2 x s16>) ; CHECK-NEXT: $vgpr29 = COPY [[UV92]](<2 x s16>) ; CHECK-NEXT: $vgpr30 = COPY [[UV93]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v63i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -2577,43 +2616,44 @@ define amdgpu_kernel void @test_call_external_void_func_v63i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v65i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v65i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<65 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<65 x s16>) from `ptr addrspace(1) undef`, align 256, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v65i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16), [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16), [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16), [[UV63:%[0-9]+]]:_(s16), [[UV64:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<65 x s16>) - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<66 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[UV63]](s16), [[UV64]](s16), [[DEF3]](s16) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<66 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[UV15]](s16), [[UV16]](s16), [[UV17]](s16), [[UV18]](s16), [[UV19]](s16), [[UV20]](s16), [[UV21]](s16), [[UV22]](s16), [[UV23]](s16), [[UV24]](s16), [[UV25]](s16), [[UV26]](s16), [[UV27]](s16), [[UV28]](s16), [[UV29]](s16), [[UV30]](s16), [[UV31]](s16), [[UV32]](s16), [[UV33]](s16), [[UV34]](s16), [[UV35]](s16), [[UV36]](s16), [[UV37]](s16), [[UV38]](s16), [[UV39]](s16), [[UV40]](s16), [[UV41]](s16), [[UV42]](s16), [[UV43]](s16), [[UV44]](s16), [[UV45]](s16), [[UV46]](s16), [[UV47]](s16), [[UV48]](s16), [[UV49]](s16), [[UV50]](s16), [[UV51]](s16), [[UV52]](s16), [[UV53]](s16), [[UV54]](s16), [[UV55]](s16), [[UV56]](s16), [[UV57]](s16), [[UV58]](s16), [[UV59]](s16), [[UV60]](s16), [[UV61]](s16), [[UV62]](s16), [[UV63]](s16), [[UV64]](s16), [[DEF2]](s16) ; CHECK-NEXT: [[UV65:%[0-9]+]]:_(<2 x s16>), [[UV66:%[0-9]+]]:_(<2 x s16>), [[UV67:%[0-9]+]]:_(<2 x s16>), [[UV68:%[0-9]+]]:_(<2 x s16>), [[UV69:%[0-9]+]]:_(<2 x s16>), [[UV70:%[0-9]+]]:_(<2 x s16>), [[UV71:%[0-9]+]]:_(<2 x s16>), [[UV72:%[0-9]+]]:_(<2 x s16>), [[UV73:%[0-9]+]]:_(<2 x s16>), [[UV74:%[0-9]+]]:_(<2 x s16>), [[UV75:%[0-9]+]]:_(<2 x s16>), [[UV76:%[0-9]+]]:_(<2 x s16>), [[UV77:%[0-9]+]]:_(<2 x s16>), [[UV78:%[0-9]+]]:_(<2 x s16>), [[UV79:%[0-9]+]]:_(<2 x s16>), [[UV80:%[0-9]+]]:_(<2 x s16>), [[UV81:%[0-9]+]]:_(<2 x s16>), [[UV82:%[0-9]+]]:_(<2 x s16>), [[UV83:%[0-9]+]]:_(<2 x s16>), [[UV84:%[0-9]+]]:_(<2 x s16>), [[UV85:%[0-9]+]]:_(<2 x s16>), [[UV86:%[0-9]+]]:_(<2 x s16>), [[UV87:%[0-9]+]]:_(<2 x s16>), [[UV88:%[0-9]+]]:_(<2 x s16>), [[UV89:%[0-9]+]]:_(<2 x s16>), [[UV90:%[0-9]+]]:_(<2 x s16>), [[UV91:%[0-9]+]]:_(<2 x s16>), [[UV92:%[0-9]+]]:_(<2 x s16>), [[UV93:%[0-9]+]]:_(<2 x s16>), [[UV94:%[0-9]+]]:_(<2 x s16>), [[UV95:%[0-9]+]]:_(<2 x s16>), [[UV96:%[0-9]+]]:_(<2 x s16>), [[UV97:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<66 x s16>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -2653,16 +2693,16 @@ define amdgpu_kernel void @test_call_external_void_func_v65i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV93]](<2 x s16>) ; CHECK-NEXT: $vgpr29 = COPY [[UV94]](<2 x s16>) ; CHECK-NEXT: $vgpr30 = COPY [[UV95]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v65i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -2675,39 +2715,40 @@ define amdgpu_kernel void @test_call_external_void_func_v65i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v66i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v66i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<66 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<66 x s16>) from `ptr addrspace(1) undef`, align 256, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v66i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>), [[UV19:%[0-9]+]]:_(<2 x s16>), [[UV20:%[0-9]+]]:_(<2 x s16>), [[UV21:%[0-9]+]]:_(<2 x s16>), [[UV22:%[0-9]+]]:_(<2 x s16>), [[UV23:%[0-9]+]]:_(<2 x s16>), [[UV24:%[0-9]+]]:_(<2 x s16>), [[UV25:%[0-9]+]]:_(<2 x s16>), [[UV26:%[0-9]+]]:_(<2 x s16>), [[UV27:%[0-9]+]]:_(<2 x s16>), [[UV28:%[0-9]+]]:_(<2 x s16>), [[UV29:%[0-9]+]]:_(<2 x s16>), [[UV30:%[0-9]+]]:_(<2 x s16>), [[UV31:%[0-9]+]]:_(<2 x s16>), [[UV32:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<66 x s16>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -2748,16 +2789,16 @@ define amdgpu_kernel void @test_call_external_void_func_v66i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](<2 x s16>) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](<2 x s16>) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v66i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -2770,51 +2811,52 @@ define amdgpu_kernel void @test_call_external_void_func_v66i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2f16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2f16, csr_amdgpu, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2827,53 +2869,54 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s32>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2886,54 +2929,55 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C2]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C2]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C3]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C3]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C4]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -2945,17 +2989,18 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i32_imm ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 @@ -2963,39 +3008,39 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3007,17 +3052,18 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i32_i32 ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 @@ -3026,40 +3072,40 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i32_i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C4]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C4]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C5]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C5]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C6]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C6]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[C3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i32_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3071,55 +3117,56 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s32>) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3132,17 +3179,18 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 @@ -3150,40 +3198,40 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C4]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C4]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C5]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C5]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C6]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C6]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3195,17 +3243,18 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v5i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 @@ -3214,24 +3263,24 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C5]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C5]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C6]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C6]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C7]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C7]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<5 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3239,16 +3288,16 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v5i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3260,40 +3309,41 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v8i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<8 x s32>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v8i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3304,16 +3354,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v8i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3327,17 +3377,18 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v8i32_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 @@ -3349,24 +3400,24 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32), [[C5]](s32), [[C6]](s32), [[C7]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v8i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C8]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C8]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C9]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C9]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C10]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C10]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<8 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3377,16 +3428,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v8i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3398,40 +3449,41 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v16i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<16 x s32>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v16i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3450,16 +3502,16 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CHECK-NEXT: $vgpr13 = COPY [[UV13]](s32) ; CHECK-NEXT: $vgpr14 = COPY [[UV14]](s32) ; CHECK-NEXT: $vgpr15 = COPY [[UV15]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v16i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -3473,40 +3525,41 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3544,16 +3597,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 4, implicit-def $scc @@ -3567,17 +3620,18 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32_i32 ; CHECK: bb.1 (%ir-block.1): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) @@ -3586,24 +3640,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3644,16 +3698,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -3668,17 +3722,18 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32_i8_i8_i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) @@ -3687,24 +3742,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s16) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i8_i8_i16 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3715,10 +3770,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C4]](s32) ; CHECK-NEXT: G_STORE [[ANYEXT]](s16), [[PTR_ADD2]](p5) :: (store (s16) into stack + 4, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s16) = COPY [[ANYEXT]](s16) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s16) = COPY [[ANYEXT]](s16) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C5]](s32) - ; CHECK-NEXT: G_STORE [[COPY18]](s16), [[PTR_ADD3]](p5) :: (store (s16) into stack + 8, align 8, addrspace 5) + ; CHECK-NEXT: G_STORE [[COPY20]](s16), [[PTR_ADD3]](p5) :: (store (s16) into stack + 8, align 8, addrspace 5) ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32) ; CHECK-NEXT: G_STORE [[LOAD3]](s16), [[PTR_ADD4]](p5) :: (store (s16) into stack + 12, align 4, addrspace 5) @@ -3753,16 +3808,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]](p4) + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32_i8_i8_i16, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 16, implicit-def $scc @@ -3779,17 +3834,18 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v32i32_p3_p5 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) @@ -3798,24 +3854,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p5) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -3859,16 +3915,16 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF2]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF3]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v32i32_p3_p5, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 12, implicit-def $scc @@ -3884,17 +3940,18 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_struct_i8_i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (s8) from %ir.ptr0, align 4, addrspace 1) @@ -3903,39 +3960,39 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: ("amdgpu-noclobber" load (s32) from %ir.ptr0 + 4, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_struct_i8_i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT1]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[LOAD2]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_struct_i8_i32, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4003,60 +4060,61 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_byval_struct_i8_i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.val ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: %14:_(p5) = nuw nusw G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; CHECK-NEXT: %15:_(p5) = nuw nusw G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) ; CHECK-NEXT: G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store (s8) into %ir.val, addrspace 5) - ; CHECK-NEXT: G_STORE [[C1]](s32), %14(p5) :: (store (s32) into %ir.gep1, addrspace 5) + ; CHECK-NEXT: G_STORE [[C1]](s32), %15(p5) :: (store (s32) into %ir.gep1, addrspace 5) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_byval_struct_i8_i32 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C3]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C4]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32) ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: G_MEMCPY [[PTR_ADD1]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store (s64) into stack, align 4, addrspace 5), (dereferenceable load (s64) from %ir.val, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_byval_struct_i8_i32, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 8, implicit-def $scc @@ -4084,7 +4142,7 @@ define void @call_byval_3ai32_byval_i8_align32(ptr addrspace(5) %incoming0, ptr ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p5) = COPY $vgpr1 @@ -4092,7 +4150,7 @@ define void @call_byval_3ai32_byval_i8_align32(ptr addrspace(5) %incoming0, ptr ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_byval_a3i32_byval_i8_align32 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -4144,13 +4202,13 @@ define void @call_byval_a4i64_align4_higher_source_align(ptr addrspace(5) align ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @void_func_byval_a4i64_align4 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -4184,40 +4242,41 @@ define void @call_byval_a4i64_align4_higher_source_align(ptr addrspace(5) align define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<2 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<2 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4226,16 +4285,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT2]](s32) ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT1]](s16) ; CHECK-NEXT: $vgpr1 = COPY [[ANYEXT3]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v2i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4249,40 +4308,41 @@ define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<3 x s8>) from %ir.ptr, align 4, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<3 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4294,16 +4354,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i8() #0 { ; CHECK-NEXT: $vgpr1 = COPY [[ANYEXT4]](s32) ; CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT2]](s16) ; CHECK-NEXT: $vgpr2 = COPY [[ANYEXT5]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v3i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4317,40 +4377,41 @@ define amdgpu_kernel void @test_call_external_void_func_v3i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v4i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<4 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<4 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4365,16 +4426,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i8() #0 { ; CHECK-NEXT: $vgpr2 = COPY [[ANYEXT6]](s32) ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT3]](s16) ; CHECK-NEXT: $vgpr3 = COPY [[ANYEXT7]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v4i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4388,40 +4449,41 @@ define amdgpu_kernel void @test_call_external_void_func_v4i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v8i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v8i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<8 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v8i8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<8 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4448,16 +4510,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i8() #0 { ; CHECK-NEXT: $vgpr6 = COPY [[ANYEXT14]](s32) ; CHECK-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT7]](s16) ; CHECK-NEXT: $vgpr7 = COPY [[ANYEXT15]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v8i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4471,40 +4533,41 @@ define amdgpu_kernel void @test_call_external_void_func_v8i8() #0 { define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v16i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<16 x s8>) from %ir.ptr, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v16i8 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD1]](<16 x s8>) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) @@ -4555,16 +4618,16 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CHECK-NEXT: $vgpr14 = COPY [[ANYEXT30]](s32) ; CHECK-NEXT: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT15]](s16) ; CHECK-NEXT: $vgpr15 = COPY [[ANYEXT31]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF1]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF2]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_v16i8, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc @@ -4578,42 +4641,43 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { ; CHECK-LABEL: name: stack_passed_arg_alignment_v32i32_f64 ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (<32 x s32>) from %ir.val.kernarg.offset1, align 16, addrspace 4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 128 - ; CHECK-NEXT: %17:_(p4) = nuw nusw G_PTR_ADD [[INT]], [[C]](s64) - ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD %17(p4) :: (dereferenceable invariant load (s64) from %ir.tmp.kernarg.offset, align 16, addrspace 4) + ; CHECK-NEXT: %18:_(p4) = nuw nusw G_PTR_ADD [[INT]], [[C]](s64) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD %18(p4) :: (dereferenceable invariant load (s64) from %ir.tmp.kernarg.offset, align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @stack_passed_f64_arg - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 136 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C1]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C2]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C3]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<32 x s32>) ; CHECK-NEXT: [[AMDGPU_WAVE_ADDRESS:%[0-9]+]]:_(p5) = G_AMDGPU_WAVE_ADDRESS $sp_reg @@ -4658,16 +4722,16 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CHECK-NEXT: $vgpr28 = COPY [[UV28]](s32) ; CHECK-NEXT: $vgpr29 = COPY [[UV29]](s32) ; CHECK-NEXT: $vgpr30 = COPY [[UV30]](s32) - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @stack_passed_f64_arg, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 12, implicit-def $scc @@ -4689,7 +4753,7 @@ define void @stack_12xv3i32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32) @@ -4722,7 +4786,7 @@ define void @stack_12xv3i32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_12xv3i32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -4832,7 +4896,7 @@ define void @stack_12xv3f32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32) @@ -4865,7 +4929,7 @@ define void @stack_12xv3f32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_12xv3f32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -4975,7 +5039,7 @@ define void @stack_8xv5i32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) @@ -5004,7 +5068,7 @@ define void @stack_8xv5i32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_8xv5i32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5118,7 +5182,7 @@ define void @stack_8xv5f32() #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) @@ -5147,7 +5211,7 @@ define void @stack_8xv5f32() #0 { ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_8xv5f32 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5287,7 +5351,7 @@ main_body: define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i16_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5296,14 +5360,14 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_inreg ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5335,7 +5399,7 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i32_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5344,13 +5408,13 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i32_inreg ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5381,7 +5445,7 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_i64_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5390,15 +5454,15 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr17 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i64_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5432,7 +5496,7 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i32_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5441,15 +5505,15 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr17 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i32_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5483,7 +5547,7 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_f16_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5492,14 +5556,14 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f16_inreg ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5531,7 +5595,7 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_bf16_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5540,14 +5604,14 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_bf16_inreg ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5579,7 +5643,7 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_f32_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5588,13 +5652,13 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f32_inreg ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5625,7 +5689,7 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_f64_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5634,15 +5698,15 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr17 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_f64_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5676,7 +5740,7 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2f16_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5685,13 +5749,13 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr16 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f16_inreg ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5723,7 +5787,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v3f16_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5732,17 +5796,17 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr17 ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f16_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5781,7 +5845,7 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v4f16_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5790,15 +5854,15 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr17 ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4f16_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5834,7 +5898,7 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_p0_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5843,15 +5907,15 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr17 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_p0_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5885,7 +5949,7 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_p1_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5894,15 +5958,15 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr17 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_p1_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5936,7 +6000,7 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_p3_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5945,13 +6009,13 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p3) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p3) = COPY $sgpr16 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_p3_inreg ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -5983,7 +6047,7 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2p1_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -5992,19 +6056,19 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr7 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr16 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr17 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr17 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr18 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $sgpr19 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY11]](s32), [[COPY12]](s32) ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2p1_inreg ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]] @@ -6042,7 +6106,7 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2p5_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6, $sgpr7, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 @@ -6051,15 +6115,15 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p5) = COPY $sgpr6 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p5) = COPY $sgpr7 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p5) = COPY $sgpr16 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p5) = COPY $sgpr17 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p5>) = G_BUILD_VECTOR [[COPY9]](p5), [[COPY10]](p5) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2p5_inreg ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll index ce0e2e40e5d19d..2ba0979c72533a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constant-fold-vector-op.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @constant_fold_vector_add() { ; CHECK-LABEL: name: constant_fold_vector_add ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64), [[C]](s64), [[C]](s64) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll index 6b0e9618754df8..9ec3c83fa0cfce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @system_one_as_acquire() { ; CHECK-LABEL: name: system_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acquire @@ -16,9 +16,9 @@ define amdgpu_kernel void @system_one_as_acquire() { define amdgpu_kernel void @system_one_as_release() { ; CHECK-LABEL: name: system_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") release @@ -28,9 +28,9 @@ define amdgpu_kernel void @system_one_as_release() { define amdgpu_kernel void @system_one_as_acq_rel() { ; CHECK-LABEL: name: system_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acq_rel @@ -40,9 +40,9 @@ define amdgpu_kernel void @system_one_as_acq_rel() { define amdgpu_kernel void @system_one_as_seq_cst() { ; CHECK-LABEL: name: system_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") seq_cst @@ -52,9 +52,9 @@ define amdgpu_kernel void @system_one_as_seq_cst() { define amdgpu_kernel void @singlethread_one_as_acquire() { ; CHECK-LABEL: name: singlethread_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acquire @@ -64,9 +64,9 @@ define amdgpu_kernel void @singlethread_one_as_acquire() { define amdgpu_kernel void @singlethread_one_as_release() { ; CHECK-LABEL: name: singlethread_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") release @@ -76,9 +76,9 @@ define amdgpu_kernel void @singlethread_one_as_release() { define amdgpu_kernel void @singlethread_one_as_acq_rel() { ; CHECK-LABEL: name: singlethread_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acq_rel @@ -88,9 +88,9 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel() { define amdgpu_kernel void @singlethread_one_as_seq_cst() { ; CHECK-LABEL: name: singlethread_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") seq_cst @@ -100,9 +100,9 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst() { define amdgpu_kernel void @agent_one_as_acquire() { ; CHECK-LABEL: name: agent_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acquire @@ -112,9 +112,9 @@ define amdgpu_kernel void @agent_one_as_acquire() { define amdgpu_kernel void @agent_one_as_release() { ; CHECK-LABEL: name: agent_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") release @@ -124,9 +124,9 @@ define amdgpu_kernel void @agent_one_as_release() { define amdgpu_kernel void @agent_one_as_acq_rel() { ; CHECK-LABEL: name: agent_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acq_rel @@ -136,9 +136,9 @@ define amdgpu_kernel void @agent_one_as_acq_rel() { define amdgpu_kernel void @agent_one_as_seq_cst() { ; CHECK-LABEL: name: agent_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") seq_cst @@ -148,9 +148,9 @@ define amdgpu_kernel void @agent_one_as_seq_cst() { define amdgpu_kernel void @workgroup_one_as_acquire() { ; CHECK-LABEL: name: workgroup_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acquire @@ -160,9 +160,9 @@ define amdgpu_kernel void @workgroup_one_as_acquire() { define amdgpu_kernel void @workgroup_one_as_release() { ; CHECK-LABEL: name: workgroup_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") release @@ -172,9 +172,9 @@ define amdgpu_kernel void @workgroup_one_as_release() { define amdgpu_kernel void @workgroup_one_as_acq_rel() { ; CHECK-LABEL: name: workgroup_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acq_rel @@ -184,9 +184,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() { define amdgpu_kernel void @workgroup_one_as_seq_cst() { ; CHECK-LABEL: name: workgroup_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") seq_cst @@ -196,9 +196,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() { define amdgpu_kernel void @wavefront_one_as_acquire() { ; CHECK-LABEL: name: wavefront_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acquire @@ -208,9 +208,9 @@ define amdgpu_kernel void @wavefront_one_as_acquire() { define amdgpu_kernel void @wavefront_one_as_release() { ; CHECK-LABEL: name: wavefront_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") release @@ -220,9 +220,9 @@ define amdgpu_kernel void @wavefront_one_as_release() { define amdgpu_kernel void @wavefront_one_as_acq_rel() { ; CHECK-LABEL: name: wavefront_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acq_rel @@ -232,9 +232,9 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel() { define amdgpu_kernel void @wavefront_one_as_seq_cst() { ; CHECK-LABEL: name: wavefront_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") seq_cst @@ -244,9 +244,9 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst() { define amdgpu_kernel void @system_acquire() { ; CHECK-LABEL: name: system_acquire ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: S_ENDPGM 0 entry: ret void @@ -255,9 +255,9 @@ entry: define amdgpu_kernel void @system_release() { ; CHECK-LABEL: name: system_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 1 ; CHECK-NEXT: S_ENDPGM 0 fence release @@ -267,9 +267,9 @@ define amdgpu_kernel void @system_release() { define amdgpu_kernel void @system_acq_rel() { ; CHECK-LABEL: name: system_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 1 ; CHECK-NEXT: S_ENDPGM 0 fence acq_rel @@ -279,9 +279,9 @@ define amdgpu_kernel void @system_acq_rel() { define amdgpu_kernel void @system_seq_cst() { ; CHECK-LABEL: name: system_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 1 ; CHECK-NEXT: S_ENDPGM 0 fence seq_cst @@ -291,9 +291,9 @@ define amdgpu_kernel void @system_seq_cst() { define amdgpu_kernel void @singlethread_acquire() { ; CHECK-LABEL: name: singlethread_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acquire @@ -303,9 +303,9 @@ define amdgpu_kernel void @singlethread_acquire() { define amdgpu_kernel void @singlethread_release() { ; CHECK-LABEL: name: singlethread_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") release @@ -315,9 +315,9 @@ define amdgpu_kernel void @singlethread_release() { define amdgpu_kernel void @singlethread_acq_rel() { ; CHECK-LABEL: name: singlethread_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acq_rel @@ -327,9 +327,9 @@ define amdgpu_kernel void @singlethread_acq_rel() { define amdgpu_kernel void @singlethread_seq_cst() { ; CHECK-LABEL: name: singlethread_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") seq_cst @@ -339,9 +339,9 @@ define amdgpu_kernel void @singlethread_seq_cst() { define amdgpu_kernel void @agent_acquire() { ; CHECK-LABEL: name: agent_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acquire @@ -351,9 +351,9 @@ define amdgpu_kernel void @agent_acquire() { define amdgpu_kernel void @agent_release() { ; CHECK-LABEL: name: agent_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") release @@ -363,9 +363,9 @@ define amdgpu_kernel void @agent_release() { define amdgpu_kernel void @agent_acq_rel() { ; CHECK-LABEL: name: agent_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acq_rel @@ -375,9 +375,9 @@ define amdgpu_kernel void @agent_acq_rel() { define amdgpu_kernel void @agent_seq_cst() { ; CHECK-LABEL: name: agent_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") seq_cst @@ -387,9 +387,9 @@ define amdgpu_kernel void @agent_seq_cst() { define amdgpu_kernel void @workgroup_acquire() { ; CHECK-LABEL: name: workgroup_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acquire @@ -399,9 +399,9 @@ define amdgpu_kernel void @workgroup_acquire() { define amdgpu_kernel void @workgroup_release() { ; CHECK-LABEL: name: workgroup_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") release @@ -411,9 +411,9 @@ define amdgpu_kernel void @workgroup_release() { define amdgpu_kernel void @workgroup_acq_rel() { ; CHECK-LABEL: name: workgroup_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acq_rel @@ -423,9 +423,9 @@ define amdgpu_kernel void @workgroup_acq_rel() { define amdgpu_kernel void @workgroup_seq_cst() { ; CHECK-LABEL: name: workgroup_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") seq_cst @@ -435,9 +435,9 @@ define amdgpu_kernel void @workgroup_seq_cst() { define amdgpu_kernel void @wavefront_acquire() { ; CHECK-LABEL: name: wavefront_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 4, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acquire @@ -447,9 +447,9 @@ define amdgpu_kernel void @wavefront_acquire() { define amdgpu_kernel void @wavefront_release() { ; CHECK-LABEL: name: wavefront_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 5, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") release @@ -459,9 +459,9 @@ define amdgpu_kernel void @wavefront_release() { define amdgpu_kernel void @wavefront_acq_rel() { ; CHECK-LABEL: name: wavefront_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 6, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acq_rel @@ -471,9 +471,9 @@ define amdgpu_kernel void @wavefront_acq_rel() { define amdgpu_kernel void @wavefront_seq_cst() { ; CHECK-LABEL: name: wavefront_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: G_FENCE 7, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll index 0c918def3dc5c0..951be00a124c71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll @@ -4,49 +4,50 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) { ; CHECK-LABEL: name: test_indirect_call_sgpr_ptr ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset1, align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY10]], [[C]](s64) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C1]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C2]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY9]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[DEF]](p4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY11]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY12]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY13]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[DEF1]](s32) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]](s32) ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index 326df0750cfbd6..c2a6b183a0f7fb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @asm_convergent() convergent{ ; CHECK-LABEL: name: asm_convergent ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: INLINEASM &s_barrier, 33 /* sideeffect isconvergent attdialect */, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_barrier", ""() convergent, !srcloc !0 @@ -16,9 +16,9 @@ define amdgpu_kernel void @asm_convergent() convergent{ define amdgpu_kernel void @asm_simple_memory_clobber() { ; CHECK-LABEL: name: asm_simple_memory_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, !1 ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, !1 ; CHECK-NEXT: S_ENDPGM 0 @@ -30,9 +30,9 @@ define amdgpu_kernel void @asm_simple_memory_clobber() { define amdgpu_kernel void @asm_simple_vgpr_clobber() { ; CHECK-LABEL: name: asm_simple_vgpr_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"(), !srcloc !0 @@ -42,9 +42,9 @@ define amdgpu_kernel void @asm_simple_vgpr_clobber() { define amdgpu_kernel void @asm_simple_sgpr_clobber() { ; CHECK-LABEL: name: asm_simple_sgpr_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $sgpr0, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, 7", "~{s0}"(), !srcloc !0 @@ -54,9 +54,9 @@ define amdgpu_kernel void @asm_simple_sgpr_clobber() { define amdgpu_kernel void @asm_simple_agpr_clobber() { ; CHECK-LABEL: name: asm_simple_agpr_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: INLINEASM &"; def a0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $agpr0, !1 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "; def a0", "~{a0}"(), !srcloc !0 @@ -66,9 +66,9 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %7, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, !1 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -94,8 +94,8 @@ entry: define i32 @test_single_vgpr_output() nounwind { ; CHECK-LABEL: name: test_single_vgpr_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -106,8 +106,8 @@ entry: define i32 @test_single_sgpr_output_s32() nounwind { ; CHECK-LABEL: name: test_single_sgpr_output_s32 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -119,9 +119,9 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7, 2228234 /* regdef:VGPR_32 */, def %8 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 2228234 /* regdef:VGPR_32 */, def %9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] ; CHECK-NEXT: $vgpr0 = COPY [[FADD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -136,9 +136,9 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %7, 3538954 /* regdef:VReg_64 */, def %8 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %8 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 3538954 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -166,9 +166,9 @@ define float @test_vector_output() nounwind { define amdgpu_kernel void @test_input_vgpr_imm() { ; CHECK-LABEL: name: test_input_vgpr_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32) ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY1]] @@ -180,9 +180,9 @@ define amdgpu_kernel void @test_input_vgpr_imm() { define amdgpu_kernel void @test_input_sgpr_imm() { ; CHECK-LABEL: name: test_input_sgpr_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32) ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[COPY1]] @@ -194,9 +194,9 @@ define amdgpu_kernel void @test_input_sgpr_imm() { define amdgpu_kernel void @test_input_imm() { ; CHECK-LABEL: name: test_input_imm ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 9 /* sideeffect mayload attdialect */, 13 /* imm */, 42 ; CHECK-NEXT: INLINEASM &"s_mov_b64 s[0:1], $0", 9 /* sideeffect mayload attdialect */, 13 /* imm */, 42 ; CHECK-NEXT: S_ENDPGM 0 @@ -212,8 +212,8 @@ define float @test_input_vgpr(i32 %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 2228233 /* reguse:VGPR_32 */, [[COPY1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2228233 /* reguse:VGPR_32 */, [[COPY1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -227,8 +227,8 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind { ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 262158 /* mem:m */, [[COPY]](p3) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %1 = tail call i32 asm "ds_read_b32 $0, $1", "=v,*m"(ptr addrspace(3) elementtype(i32) %a) @@ -244,8 +244,8 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %and = and i32 %a, 1 @@ -256,14 +256,14 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind { define i32 @test_sgpr_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %9 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %11, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %11 + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %12, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -285,10 +285,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2228234 /* regdef:VGPR_32 */, def %11, 2228234 /* regdef:VGPR_32 */, def %12, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %10 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %11 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %12 + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2228234 /* regdef:VGPR_32 */, def %12, 2228234 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13 ; CHECK-NEXT: G_STORE [[COPY6]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) @@ -306,11 +306,11 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind { define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind { ; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %7 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %7 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 entry: @@ -322,9 +322,9 @@ entry: define amdgpu_kernel void @asm_constraint_n_n() { ; CHECK-LABEL: name: asm_constraint_n_n ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: INLINEASM &"s_trap ${0:n}", 1 /* sideeffect attdialect */, 13 /* imm */, 10 ; CHECK-NEXT: S_ENDPGM 0 tail call void asm sideeffect "s_trap ${0:n}", "n"(i32 10) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll index 81d2f36ac8746d..7be77f403a214c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll @@ -15,11 +15,11 @@ define void @tail_call_void_func_void() { ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @external_void_func_void ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY5]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY4]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index c3938e673a6da6..b250e016492bc2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CHECK-LABEL: use_lds_globals: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 ; CHECK-NEXT: ds_read_b32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index b8b7256011df89..39dde4bc86becd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.1 (%ir-block.0): - ; GCN: liveins: $sgpr2_sgpr3 - ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds ; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 19ccb476a0a07f..459cdbd9067e00 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -48,21 +48,21 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -91,7 +91,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -110,7 +110,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -131,21 +131,21 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v1, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -174,26 +174,26 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[4:5], v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -216,7 +216,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc @@ -224,7 +224,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -232,7 +232,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -263,26 +263,26 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[4:5], v[2:3], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -296,7 +296,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -305,7 +305,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc @@ -313,7 +313,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -321,7 +321,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f64_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -352,67 +352,65 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s8, s[4:5], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s8 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s8 +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x54 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x54 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s4 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -428,67 +426,65 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s8, s[4:5], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, v0, s8 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s8, v0, s8 +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, v0, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, v0, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_div_scale_f32 v0, null, s4, v0, s4 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -504,67 +500,65 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s8, s[4:5], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, s8, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s8, s8, v0 +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s4, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -580,67 +574,65 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s8, s[4:5], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, s8, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, s8, v0 +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, s4, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s4, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -656,68 +648,67 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) { ; GFX7-LABEL: test_div_scale_f64_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[8:9] -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[8:9] +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x54 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -733,68 +724,67 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) { ; GFX7-LABEL: test_div_scale_f64_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[8:9], v[0:1], s[8:9] -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[8:9], v[0:1], s[8:9] +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[6:7], v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x54 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -810,68 +800,67 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[8:9], s[8:9], v[0:1] -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[8:9], s[8:9], v[0:1] +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], s[4:5], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[6:7], s[6:7], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x54 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -887,68 +876,67 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[8:9], v[0:1] -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], s[8:9], v[0:1] +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], s[4:5], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], s[6:7], v[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x54 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[4:5], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -964,25 +952,25 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c -; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x1c +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s6 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x70 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x4c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -992,24 +980,24 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x4c +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s5, s5, s4 +; GFX10-NEXT: v_div_scale_f32 v0, s2, s3, s3, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4 +; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) @@ -1021,25 +1009,25 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c -; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x1c +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s5, v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s6, v0, s6 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x70 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x4c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s1, v0, s1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1049,24 +1037,24 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x4c +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s5, s4 +; GFX10-NEXT: v_div_scale_f32 v0, s2, s2, s3, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) @@ -1078,12 +1066,12 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[6:7] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1092,13 +1080,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[4:5] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1108,12 +1096,12 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[4:5], s[4:5], s[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], s[2:3], s[0:1] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -1121,12 +1109,12 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; GFX11-LABEL: test_div_scale_f64_all_scalar_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x74 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[0:1] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[2:3], s[0:1] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -1139,12 +1127,12 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[6:7], v[0:1], s[6:7] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1153,13 +1141,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[4:5], v[0:1], s[4:5] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], v[0:1], s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1169,12 +1157,12 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -1182,12 +1170,12 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; GFX11-LABEL: test_div_scale_f64_all_scalar_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x74 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[4:5], s[0:1] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[2:3], s[0:1] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -1200,7 +1188,7 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1217,7 +1205,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1234,19 +1222,19 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, 1.0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1269,7 +1257,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1286,7 +1274,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1303,19 +1291,19 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, 2.0, 2.0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1338,7 +1326,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1358,7 +1346,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1380,22 +1368,22 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_fabs_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1428,7 +1416,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1448,7 +1436,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1470,22 +1458,22 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2 -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_fabs_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1518,7 +1506,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0 @@ -1531,7 +1519,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1540,7 +1528,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 @@ -1549,7 +1537,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_val_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 @@ -1564,7 +1552,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_val_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0 @@ -1577,7 +1565,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1586,7 +1574,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 @@ -1595,7 +1583,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_undef_val_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 @@ -1610,7 +1598,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1621,7 +1609,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; GFX8-LABEL: test_div_scale_f32_undef_undef_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1630,7 +1618,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX10-LABEL: test_div_scale_f32_undef_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 @@ -1639,7 +1627,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX11-LABEL: test_div_scale_f32_undef_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 @@ -1654,7 +1642,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f64_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x40200000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1669,7 +1657,7 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1679,7 +1667,7 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX10-LABEL: test_div_scale_f64_val_undef_val: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], 0x40200000 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1688,7 +1676,7 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX11-LABEL: test_div_scale_f64_val_undef_val: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll index 287546750d878d..3c60db4195d1e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 @@ -14,7 +14,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB0_2: ; %bb -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x24 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 @@ -34,7 +34,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB0_2: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll index 06393857352b3a..d1ce73b64bca14 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index 4a9594ad45e15c..df499006e50bde 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -147,10 +147,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -158,7 +158,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN @@ -175,8 +175,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -186,16 +186,16 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll index 572894af885174..3ee5b8c1e072bf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x24 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-NEXT: s_load_dword s1, s[8:9], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 @@ -22,8 +22,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index d7a82b415ff06c..78b2a5bf1050c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[6:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xa +; GCN-NEXT: s_load_dword s2, s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xa ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 614f59c564df64..b26ddbdd7a342e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s4 -; GFX1013-NEXT: v_mov_b32_e32 v1, s5 -; GFX1013-NEXT: v_mov_b32_e32 v2, s6 -; GFX1013-NEXT: v_mov_b32_e32 v3, s7 +; GFX1013-NEXT: v_mov_b32_e32 v0, s8 +; GFX1013-NEXT: v_mov_b32_e32 v1, s9 +; GFX1013-NEXT: v_mov_b32_e32 v2, s10 +; GFX1013-NEXT: v_mov_b32_e32 v3, s11 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 @@ -681,14 +681,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[12:15] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 @@ -769,14 +769,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s4 -; GFX1013-NEXT: v_mov_b32_e32 v1, s5 -; GFX1013-NEXT: v_mov_b32_e32 v2, s6 -; GFX1013-NEXT: v_mov_b32_e32 v3, s7 +; GFX1013-NEXT: v_mov_b32_e32 v0, s8 +; GFX1013-NEXT: v_mov_b32_e32 v1, s9 +; GFX1013-NEXT: v_mov_b32_e32 v2, s10 +; GFX1013-NEXT: v_mov_b32_e32 v3, s11 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 @@ -789,14 +789,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 @@ -844,73 +844,46 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ } define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { -; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000 -; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000 -; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 -; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 -; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1030-NEXT: s_endpgm -; -; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0 -; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000 -; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000 -; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 -; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 -; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7] -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1013-NEXT: s_endpgm +; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX10-NEXT: v_mov_b32_e32 v7, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0x40a00000 +; GFX10-NEXT: v_mov_b32_e32 v9, 0x40c00000 +; GFX10-NEXT: v_mov_b32_e32 v10, 0x40e00000 +; GFX10-NEXT: v_mov_b32_e32 v11, 0x41000000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c7 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s6, 2.0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v9, 0xb36211c7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 @@ -921,10 +894,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo @@ -954,76 +926,51 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 } define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { -; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 -; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1030-NEXT: flat_load_dword v2, v[0:1] -; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1030-NEXT: s_endpgm -; -; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v3, 0 -; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 -; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: flat_load_dword v2, v[0:1] -; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 -; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] -; GFX1013-NEXT: s_endpgm +; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0x44004200 +; GFX10-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX10-NEXT: v_mov_b32_e32 v8, 0x48004700 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c6 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 4.0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 -; GFX11-NEXT: s_mov_b32 s10, 0x45004800 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: s_mov_b32 s10, 0x45004800 ; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 ; GFX11-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_mov_b32 s5, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 79164037bc6aaa..a31064e2936220 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[36:37], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s36, 2 @@ -81,7 +81,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -127,13 +127,13 @@ bb: define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN-NEXT: s_mov_b64 s[6:7], 1 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s6, 2 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: s_mov_b64 s[4:5], 1 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s4, 2 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a0, s0 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 @@ -143,7 +143,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -157,7 +157,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -204,13 +204,13 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN-NEXT: s_mov_b64 s[6:7], 1 -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s6, 2 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: s_mov_b64 s[4:5], 1 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s4, 2 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a0, s0 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 @@ -221,7 +221,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -235,18 +235,18 @@ bb: define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_4x4x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx2 v0, a[0:1], s[4:5] +; GCN-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1] ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0) @@ -258,8 +258,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -292,11 +292,11 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 @@ -304,8 +304,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) % ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[4:5] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) @@ -317,8 +317,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 1.0 ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -352,32 +352,32 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, 0x405ec000 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s1, 0x405ec000 +; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1] -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] -; GCN-NEXT: v_accvgpr_write_b32 a0, s4 -; GCN-NEXT: v_accvgpr_write_b32 a1, s5 -; GCN-NEXT: v_accvgpr_write_b32 a2, s6 -; GCN-NEXT: v_accvgpr_write_b32 a3, s7 -; GCN-NEXT: v_accvgpr_write_b32 a4, s8 -; GCN-NEXT: v_accvgpr_write_b32 a5, s9 -; GCN-NEXT: v_accvgpr_write_b32 a6, s10 -; GCN-NEXT: v_accvgpr_write_b32 a7, s11 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: s_mov_b64 s[6:7], s[0:1] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_accvgpr_write_b32 a4, s4 +; GCN-NEXT: v_accvgpr_write_b32 a5, s5 +; GCN-NEXT: v_accvgpr_write_b32 a6, s6 +; GCN-NEXT: v_accvgpr_write_b32 a7, s7 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[12:13] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index 3b402f919f3430..76e56d91e6d8c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 @@ -22,11 +22,11 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; encoding: [0x82,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; encoding: [0x02,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] @@ -34,10 +34,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c ; encoding: [0x82,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; encoding: [0x02,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01] ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] @@ -48,7 +48,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; GFX8-LABEL: mov_dpp64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -61,19 +61,19 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX10-LABEL: mov_dpp64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; encoding: [0x01,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; encoding: [0x02,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11] ; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX11-LABEL: mov_dpp64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; encoding: [0x02,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 5a4b4e62bd8ae5..45bade21385be6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -62,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace(1) %src0) #0 { ; GFX6-LABEL: v_bfe_print_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -118,7 +118,7 @@ define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_i32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s4, 8 +; GFX6-NEXT: s_bfe_i32 s3, s3, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -173,7 +173,7 @@ define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -194,7 +194,7 @@ define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -215,7 +215,7 @@ define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -234,7 +234,7 @@ define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -272,7 +272,7 @@ define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -291,7 +291,7 @@ define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -311,7 +311,7 @@ define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -347,7 +347,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -379,7 +379,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -411,7 +411,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -428,7 +428,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -445,7 +445,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -462,7 +462,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -479,7 +479,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_i32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -496,7 +496,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0xffff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -513,7 +513,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -530,7 +530,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -547,7 +547,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -564,7 +564,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -598,7 +598,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -614,7 +614,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -631,7 +631,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -648,7 +648,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_sext_in_reg_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -670,7 +670,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -694,7 +694,7 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_0_width: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -713,7 +713,7 @@ define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -734,7 +734,7 @@ define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -756,7 +756,7 @@ define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_16_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -778,7 +778,7 @@ define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 @@ -799,7 +799,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 8 @@ -820,7 +820,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -844,7 +844,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -890,7 +890,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, pt define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -912,7 +912,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, pt define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index f015099517902b..73b891e43de99e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,11 +4,11 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -25,7 +25,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -42,7 +42,7 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, v0 @@ -84,17 +84,17 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: s_buffer_load_dword s7, s[0:3], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3] ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cmp_lg_u32 s7, 56 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, 1 ; GCN-NEXT: s_cbranch_scc0 .LBB4_2 @@ -137,14 +137,14 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -161,7 +161,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd ; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc @@ -188,14 +188,14 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -212,14 +212,14 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -236,7 +236,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -259,7 +259,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -282,14 +282,14 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -306,7 +306,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -332,7 +332,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -358,7 +358,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -384,7 +384,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -407,11 +407,11 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -428,11 +428,11 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -449,11 +449,11 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -470,11 +470,11 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index f3654fea486e0c..dd2f26f7b73a18 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -40,8 +40,8 @@ define double @v_trig_preop_f64_imm(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -57,8 +57,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -97,8 +97,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX11-LABEL: s_trig_preop_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -127,7 +127,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -142,7 +142,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -160,7 +160,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX11-LABEL: s_trig_preop_f64_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll index d7fbec74af3858..d327c15ae547fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s3, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -63,7 +63,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -81,7 +81,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -118,7 +118,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 8 +; GFX6-NEXT: s_bfe_u32 s3, s3, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zextload_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -197,7 +197,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -220,7 +220,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -243,7 +243,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -266,7 +266,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -289,7 +289,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -312,7 +312,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -352,7 +352,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -373,7 +373,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -438,7 +438,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -459,7 +459,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -499,7 +499,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -518,7 +518,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -537,7 +537,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -557,7 +557,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -577,7 +577,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -597,7 +597,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -613,7 +613,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -629,7 +629,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -645,7 +645,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -661,7 +661,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -677,7 +677,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -694,7 +694,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -711,7 +711,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -728,7 +728,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -745,7 +745,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_u32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -762,7 +762,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0xffff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -779,7 +779,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -796,7 +796,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -813,7 +813,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -830,7 +830,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -847,7 +847,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -864,7 +864,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -880,7 +880,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -897,7 +897,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -918,13 +918,13 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX6-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s0, s8, 63 ; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 @@ -947,11 +947,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -965,7 +965,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: v_lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s3, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s3, 7 @@ -983,11 +983,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1001,11 +1001,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1019,11 +1019,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: shl_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x150002 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x150002 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 336a7767ee0622..41f57bb23a45f6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -19,18 +19,18 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -44,7 +44,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { ; GFX8-LABEL: update_dppi64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -62,21 +62,21 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX10-LABEL: update_dppi64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppi64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { ; GFX8-LABEL: update_dppf64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -116,21 +116,21 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX10-LABEL: update_dppf64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppf64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { ; GFX8-LABEL: update_dppv2i32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -170,21 +170,21 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2i32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -206,7 +206,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { ; GFX8-LABEL: update_dppv2f32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -224,21 +224,21 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2f32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -260,7 +260,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p0_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -278,21 +278,21 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX10-LABEL: update_dpp_p0_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dpp_p0_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -314,7 +314,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p3_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -329,7 +329,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p3_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -342,7 +342,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX11-LABEL: update_dpp_p3_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -365,11 +365,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; GFX8-LABEL: update_dpp_p5_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s90, -1 ; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_add_u32 s88, s88, s9 +; GFX8-NEXT: s_add_u32 s88, s88, s11 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -384,13 +384,13 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p5_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s14, -1 ; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-NEXT: s_add_u32 s12, s12, s9 +; GFX10-NEXT: s_add_u32 s12, s12, s11 ; GFX10-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -403,7 +403,7 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; ; GFX11-LABEL: update_dpp_p5_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 646cb48d37367b..009beeb395100c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @localize_constants(i1 %cond) { ; GFX9-LABEL: localize_constants: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 @@ -95,7 +95,7 @@ bb2: define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-LABEL: localize_globals: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll index fabddb3cb84a5e..2351bf2d6e8766 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll @@ -5,9 +5,9 @@ define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg ; CHECK-LABEL: test_fmamix_constant_bus_violation_sss: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s6, s16, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s6, s18, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v0, s5 ; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel_hi:[1,1,1] @@ -32,8 +32,8 @@ define float @test_fmamix_constant_bus_violation_ssv(i32 inreg %val.0, i32 inreg ; CHECK-LABEL: test_fmamix_constant_bus_violation_ssv: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mad_mix_f32 v0, s4, v1, v0 op_sel:[0,0,1] op_sel_hi:[1,1,1] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -57,8 +57,8 @@ define float @test_fmamix_constant_bus_violation_svs(i32 inreg %val.0, i32 %val. ; CHECK-LABEL: test_fmamix_constant_bus_violation_svs: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mad_mix_f32 v0, s4, v0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -82,8 +82,8 @@ define float @test_fmamix_constant_bus_violation_vss(i32 %val.0, i32 inreg %val. ; CHECK-LABEL: test_fmamix_constant_bus_violation_vss: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_lshr_b32 s5, s7, 16 -; CHECK-NEXT: s_lshr_b32 s4, s6, 16 +; CHECK-NEXT: s_lshr_b32 s5, s17, 16 +; CHECK-NEXT: s_lshr_b32 s4, s16, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mad_mix_f32 v0, v0, s4, v1 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index b3b7457da64d1a..c87c334217b772 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -7,23 +7,23 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 @@ -56,41 +56,41 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX10-NEXT: global_load_dword v4, v3, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dword v4, v3, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v1, s[6:7] -; GFX11-NEXT: global_load_b32 v5, v2, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] +; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -108,41 +108,41 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX10-NEXT: global_load_dword v4, v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -160,35 +160,35 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_zext_src0_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_zext_src0_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -207,41 +207,41 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX10-NEXT: global_load_dword v4, v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src0_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v5, v0, s[6:7] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -259,37 +259,37 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX10-NEXT: global_store_dwordx2 v0, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src0_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -307,38 +307,38 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src1_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: ; kill: killed $vgpr3 -; GFX10-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX10-NEXT: ; kill: killed $sgpr2_sgpr3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] -; GFX10-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX10-NEXT: ; kill: killed $sgpr6_sgpr7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_masked_src1_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] -; GFX11-NEXT: global_load_b64 v[1:2], v2, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v0, v2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -355,7 +355,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul_i64_masked_src0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -365,7 +365,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_mul_i64_masked_src0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -387,38 +387,38 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-LABEL: v_mul_i64_partially_masked_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v6, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v6, v3, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v2, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_partially_masked_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -431,7 +431,7 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: global_store_b64 v0, v[4:5], s[4:5] +; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -448,7 +448,7 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul64_masked_before_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -458,7 +458,7 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_mul64_masked_before_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -494,55 +494,55 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX10-LABEL: v_mul64_masked_before_and_in_branch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[6:7] ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB10_2 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, v2, v4, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s3, v2, v5, v[1:2] ; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: .LBB10_2: ; %Flow -; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0 +; GFX10-NEXT: s_andn2_saveexec_b32 s2, s2 ; GFX10-NEXT: s_cbranch_execz .LBB10_4 ; GFX10-NEXT: ; %bb.3: ; %if ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: .LBB10_4: ; %endif -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul64_masked_before_and_in_branch: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7] -; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1] -; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5] +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3] -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -553,16 +553,16 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v1, v3 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB10_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-NEXT: ; %bb.3: ; %if ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: .LBB10_4: ; %endif -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 8cc4b7759c3a33..07c94362d04399 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2538,7 +2538,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_zext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2555,52 +2555,52 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_zext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: s_mulk_i32 s0, 0x50 -; GFX8-NEXT: v_readfirstlane_b32 s1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX8-NEXT: s_mulk_i32 s2, 0x50 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s1, 0x50 -; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_i32 s2, s3, 0x50 +; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s0, s1, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_mul_i32 s2, s3, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0 @@ -2613,7 +2613,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_zext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2706,7 +2706,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_sext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2726,61 +2726,61 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_sext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: s_mulk_i32 s0, 0x50 -; GFX8-NEXT: s_mulk_i32 s1, 0x50 -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: s_add_u32 s1, s1, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX8-NEXT: s_ashr_i32 s3, s2, 31 +; GFX8-NEXT: s_mulk_i32 s2, 0x50 +; GFX8-NEXT: s_mulk_i32 s3, 0x50 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_add_u32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s1, 31 -; GFX9-NEXT: s_mul_i32 s0, s1, 0x50 -; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50 -; GFX9-NEXT: s_mulk_i32 s2, 0x50 -; GFX9-NEXT: s_add_u32 s1, s2, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_mul_i32 s2, s3, 0x50 +; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50 +; GFX9-NEXT: s_mulk_i32 s4, 0x50 +; GFX9-NEXT: s_add_u32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_sext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50 -; GFX10-NEXT: s_mulk_i32 s1, 0x50 -; GFX10-NEXT: s_mulk_i32 s0, 0x50 -; GFX10-NEXT: s_add_i32 s1, s2, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50 +; GFX10-NEXT: s_mulk_i32 s3, 0x50 +; GFX10-NEXT: s_mulk_i32 s2, 0x50 +; GFX10-NEXT: s_add_i32 s3, s4, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_sext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2796,7 +2796,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_sext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index c7afbeabbbb6b1..01287d5b7cf247 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x400 @@ -22,12 +22,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s4, s[6:7], 0xc +; GCN-NEXT: s_load_dword s4, s[8:9], 0xc ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dword s5, s[6:7], 0x10 +; GCN-NEXT: s_load_dword s5, s[8:9], 0x10 ; GCN-NEXT: s_add_u32 s4, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -39,7 +39,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -84,8 +84,8 @@ bb.2: define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x1000 @@ -93,7 +93,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s4, s[6:7], 0xc +; GCN-NEXT: s_load_dword s4, s[8:9], 0xc ; GCN-NEXT: s_add_u32 s5, s32, 0x1000 ; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -106,7 +106,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 7d7f450e590faa..5f568839a28dd3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -6,23 +6,23 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s8, s5, 31 -; GFX8-NEXT: s_add_i32 s0, s5, s8 -; GFX8-NEXT: s_xor_b32 s5, s0, s8 +; GFX8-NEXT: s_ashr_i32 s6, s5, 31 +; GFX8-NEXT: s_add_i32 s0, s5, s6 +; GFX8-NEXT: s_xor_b32 s5, s0, s6 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_ashr_i32 s7, s4, 31 +; GFX8-NEXT: s_add_i32 s4, s4, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_xor_b32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s6, s7, s6 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s7, s6, s8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -39,20 +39,20 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s7, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 @@ -60,21 +60,21 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: s_xor_b32 s5, s1, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: s_sub_i32 s1, 0, s5 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 @@ -86,8 +86,8 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: global_store_dword v2, v1, s[2:3] @@ -95,17 +95,17 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s8, s0, 31 +; GFX10-NEXT: s_ashr_i32 s6, s0, 31 ; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s8 +; GFX10-NEXT: s_add_i32 s0, s0, s6 ; GFX10-NEXT: s_xor_b32 s5, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s8 +; GFX10-NEXT: s_xor_b32 s0, s0, s6 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: s_xor_b32 s4, s6, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -128,9 +128,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s8, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s6, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: global_store_dword v2, v1, s[2:3] @@ -145,7 +145,7 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -305,25 +305,25 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s13, 31 -; GFX9-NEXT: s_ashr_i32 s4, s15, 31 -; GFX9-NEXT: s_add_u32 s0, s12, s2 -; GFX9-NEXT: s_addc_u32 s1, s13, s2 -; GFX9-NEXT: s_add_u32 s6, s14, s4 +; GFX9-NEXT: s_ashr_i32 s2, s17, 31 +; GFX9-NEXT: s_ashr_i32 s4, s19, 31 +; GFX9-NEXT: s_add_u32 s0, s16, s2 +; GFX9-NEXT: s_addc_u32 s1, s17, s2 +; GFX9-NEXT: s_add_u32 s6, s18, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s7, s15, s4 +; GFX9-NEXT: s_addc_u32 s7, s19, s4 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s14, 0, s6 -; GFX9-NEXT: s_subb_u32 s15, 0, s7 +; GFX9-NEXT: s_sub_u32 s10, 0, s6 +; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -331,10 +331,10 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 @@ -356,11 +356,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 @@ -382,18 +382,18 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -404,13 +404,13 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s12, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s13, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] @@ -453,27 +453,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s2, s13, 31 -; GFX10-NEXT: s_ashr_i32 s4, s15, 31 -; GFX10-NEXT: s_add_u32 s0, s12, s2 -; GFX10-NEXT: s_addc_u32 s1, s13, s2 -; GFX10-NEXT: s_add_u32 s6, s14, s4 +; GFX10-NEXT: s_ashr_i32 s2, s17, 31 +; GFX10-NEXT: s_ashr_i32 s4, s19, 31 +; GFX10-NEXT: s_add_u32 s0, s16, s2 +; GFX10-NEXT: s_addc_u32 s1, s17, s2 +; GFX10-NEXT: s_add_u32 s6, s18, s4 ; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_addc_u32 s7, s15, s4 +; GFX10-NEXT: s_addc_u32 s7, s19, s4 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX10-NEXT: s_sub_u32 s12, 0, s6 +; GFX10-NEXT: s_sub_u32 s8, 0, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -484,55 +484,55 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s13, s12, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2] -; GFX10-NEXT: s_subb_u32 s13, 0, s7 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s9, s8, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, s8, v4, v[1:2] +; GFX10-NEXT: s_subb_u32 s9, 0, s7 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 ; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s9, v3, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1 ; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_co_u32 v2, s14, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s14 -; GFX10-NEXT: v_add_co_u32 v6, s14, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s14 -; GFX10-NEXT: v_add_co_u32 v0, s14, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s14 -; GFX10-NEXT: v_add_co_u32 v2, s14, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v2, s10, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 -; GFX10-NEXT: v_add_co_u32 v0, s14, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s14 +; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s12, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s12, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v3, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s8, v4, v[1:2] ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s13, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s9, v3, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1 ; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s12 -; GFX10-NEXT: v_add_co_u32 v6, s12, v7, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s12 -; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s12 -; GFX10-NEXT: v_add_co_u32 v2, s12, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v2, s8, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s8 +; GFX10-NEXT: v_add_co_u32 v6, s8, v7, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s8 +; GFX10-NEXT: v_add_co_u32 v0, s8, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 +; GFX10-NEXT: v_add_co_u32 v2, s8, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 -; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12 +; GFX10-NEXT: v_add_co_u32 v0, s8, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0 @@ -541,24 +541,24 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v3 +; GFX10-NEXT: v_add_co_u32 v2, s8, v2, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, s0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12 -; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12 -; GFX10-NEXT: v_add_co_u32 v0, s12, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 +; GFX10-NEXT: v_add_co_u32 v2, s8, v2, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX10-NEXT: v_add_co_u32 v0, s8, v5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_co_u32 v0, s12, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12 -; GFX10-NEXT: v_add_co_u32 v5, s12, v0, v2 +; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s12, s6, v5, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0 ; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s6, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s7, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2] ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 @@ -603,8 +603,8 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v5, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv i64 %x, %y store i64 %div, ptr addrspace(1) %out0 @@ -616,7 +616,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -692,95 +692,95 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s14, 31 -; GFX9-NEXT: s_add_i32 s1, s14, s0 -; GFX9-NEXT: s_xor_b32 s1, s1, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_ashr_i32 s2, s15, 31 -; GFX9-NEXT: s_add_i32 s3, s15, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_ashr_i32 s8, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_ashr_i32 s9, s7, 31 +; GFX9-NEXT: s_add_i32 s7, s7, s9 +; GFX9-NEXT: s_xor_b32 s7, s7, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_ashr_i32 s4, s12, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s12, 0, s6 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s7, 0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s13, 31 -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s6, s12, s4 +; GFX9-NEXT: s_sub_i32 s12, 0, s7 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX9-NEXT: s_add_i32 s7, s13, s5 +; GFX9-NEXT: s_ashr_i32 s11, s5, 31 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: s_add_i32 s5, s5, s11 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_xor_b32 s7, s7, s5 -; GFX9-NEXT: s_xor_b32 s0, s4, s0 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1 +; GFX9-NEXT: s_xor_b32 s5, s5, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: s_xor_b32 s4, s10, s8 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 -; GFX9-NEXT: s_xor_b32 s0, s5, s2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s1, s14, 31 -; GFX10-NEXT: s_ashr_i32 s2, s15, 31 -; GFX10-NEXT: s_add_i32 s0, s14, s1 -; GFX10-NEXT: s_add_i32 s3, s15, s2 +; GFX10-NEXT: s_ashr_i32 s1, s18, 31 +; GFX10-NEXT: s_ashr_i32 s2, s19, 31 +; GFX10-NEXT: s_add_i32 s0, s18, s1 +; GFX10-NEXT: s_add_i32 s3, s19, s2 ; GFX10-NEXT: s_xor_b32 s4, s0, s1 ; GFX10-NEXT: s_xor_b32 s3, s3, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: s_sub_i32 s0, 0, s4 ; GFX10-NEXT: s_sub_i32 s5, 0, s3 -; GFX10-NEXT: s_ashr_i32 s6, s13, 31 +; GFX10-NEXT: s_ashr_i32 s6, s17, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_add_i32 s7, s13, s6 +; GFX10-NEXT: s_add_i32 s7, s17, s6 ; GFX10-NEXT: s_xor_b32 s7, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -788,8 +788,8 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1 -; GFX10-NEXT: s_ashr_i32 s5, s12, 31 -; GFX10-NEXT: s_add_i32 s0, s12, s5 +; GFX10-NEXT: s_ashr_i32 s5, s16, 31 +; GFX10-NEXT: s_add_i32 s0, s16, s5 ; GFX10-NEXT: s_xor_b32 s1, s5, s1 ; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -832,8 +832,8 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 @@ -845,27 +845,27 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s2, s12, 31 -; GFX8-NEXT: s_add_i32 s0, s12, s2 +; GFX8-NEXT: s_ashr_i32 s2, s16, 31 +; GFX8-NEXT: s_add_i32 s0, s16, s2 ; GFX8-NEXT: s_xor_b32 s3, s0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX8-NEXT: s_sub_i32 s1, 0, s3 -; GFX8-NEXT: s_ashr_i32 s16, s13, 31 -; GFX8-NEXT: s_add_i32 s0, s13, s16 +; GFX8-NEXT: s_ashr_i32 s9, s17, 31 +; GFX8-NEXT: s_add_i32 s0, s17, s9 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_xor_b32 s13, s0, s16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX8-NEXT: s_ashr_i32 s12, s8, 31 +; GFX8-NEXT: s_xor_b32 s10, s0, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX8-NEXT: s_ashr_i32 s8, s12, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s8, s12 -; GFX8-NEXT: s_xor_b32 s0, s0, s12 +; GFX8-NEXT: s_add_i32 s0, s12, s8 +; GFX8-NEXT: s_xor_b32 s0, s0, s8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: s_sub_i32 s8, 0, s13 +; GFX8-NEXT: s_sub_i32 s11, 0, s10 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -883,53 +883,53 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: s_xor_b32 s0, s12, s2 -; GFX8-NEXT: s_ashr_i32 s2, s9, 31 -; GFX8-NEXT: s_add_i32 s1, s9, s2 +; GFX8-NEXT: v_mul_lo_u32 v3, s11, v1 +; GFX8-NEXT: s_xor_b32 s0, s8, s2 +; GFX8-NEXT: s_ashr_i32 s2, s13, 31 +; GFX8-NEXT: s_add_i32 s1, s13, s2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_ashr_i32 s3, s14, 31 +; GFX8-NEXT: s_ashr_i32 s3, s18, 31 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s13 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s12, v2 -; GFX8-NEXT: s_add_i32 s0, s14, s3 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s10 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v2 +; GFX8-NEXT: s_add_i32 s0, s18, s3 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX8-NEXT: s_xor_b32 s8, s0, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s10, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s10, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX8-NEXT: s_ashr_i32 s9, s10, 31 -; GFX8-NEXT: s_add_i32 s1, s10, s9 -; GFX8-NEXT: s_xor_b32 s1, s1, s9 +; GFX8-NEXT: s_xor_b32 s0, s2, s9 +; GFX8-NEXT: s_ashr_i32 s9, s14, 31 +; GFX8-NEXT: s_add_i32 s1, s14, s9 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX8-NEXT: s_xor_b32 s0, s2, s16 +; GFX8-NEXT: s_xor_b32 s1, s1, s9 ; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s2, v2 -; GFX8-NEXT: s_ashr_i32 s2, s15, 31 +; GFX8-NEXT: s_ashr_i32 s2, s19, 31 ; GFX8-NEXT: v_mul_lo_u32 v6, v3, s8 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_add_i32 s0, s15, s2 +; GFX8-NEXT: s_add_i32 s0, s19, s2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 @@ -949,8 +949,8 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v6 ; GFX8-NEXT: s_xor_b32 s0, s9, s3 -; GFX8-NEXT: s_ashr_i32 s3, s11, 31 -; GFX8-NEXT: s_add_i32 s1, s11, s3 +; GFX8-NEXT: s_ashr_i32 s3, s15, 31 +; GFX8-NEXT: s_add_i32 s1, s15, s3 ; GFX8-NEXT: v_mul_hi_u32 v2, v6, v2 ; GFX8-NEXT: s_xor_b32 s1, s1, s3 ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 @@ -986,235 +986,235 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s4 -; GFX9-NEXT: s_xor_b32 s5, s0, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_ashr_i32 s6, s13, 31 -; GFX9-NEXT: s_add_i32 s7, s13, s6 +; GFX9-NEXT: s_ashr_i32 s12, s4, 31 +; GFX9-NEXT: s_add_i32 s4, s4, s12 +; GFX9-NEXT: s_xor_b32 s4, s4, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_ashr_i32 s13, s5, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s13 +; GFX9-NEXT: s_xor_b32 s5, s5, s13 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_sub_i32 s15, 0, s4 +; GFX9-NEXT: s_ashr_i32 s14, s0, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_ashr_i32 s12, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s12 -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: s_add_i32 s0, s0, s14 +; GFX9-NEXT: s_xor_b32 s0, s0, s14 +; GFX9-NEXT: v_mul_lo_u32 v2, s15, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s8, s8, s12 +; GFX9-NEXT: s_sub_i32 s15, 0, s5 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 -; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 -; GFX9-NEXT: s_ashr_i32 s13, s9, 31 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX9-NEXT: v_mul_lo_u32 v3, s15, v1 +; GFX9-NEXT: s_ashr_i32 s15, s1, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s5 +; GFX9-NEXT: s_add_i32 s1, s1, s15 +; GFX9-NEXT: s_xor_b32 s1, s1, s15 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s0, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v2 -; GFX9-NEXT: s_xor_b32 s4, s12, s4 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_ashr_i32 s4, s14, 31 -; GFX9-NEXT: s_add_i32 s5, s14, s4 -; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX9-NEXT: s_xor_b32 s0, s14, s12 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3 +; GFX9-NEXT: s_add_i32 s1, s6, s0 +; GFX9-NEXT: s_xor_b32 s1, s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s1 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s5 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: s_sub_i32 s4, 0, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s6, s13, s6 -; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 -; GFX9-NEXT: s_ashr_i32 s6, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s6 +; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX9-NEXT: s_xor_b32 s4, s15, s13 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: s_ashr_i32 s4, s7, 31 +; GFX9-NEXT: s_add_i32 s6, s7, s4 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s6 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s5 +; GFX9-NEXT: v_xor_b32_e32 v2, s15, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s1 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s8, v6 -; GFX9-NEXT: s_sub_i32 s8, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 +; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s2, v6 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: v_mul_lo_u32 v8, s2, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s1, v2 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_add_i32 s2, s3, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s1 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: v_mul_hi_u32 v7, s2, v7 +; GFX9-NEXT: s_xor_b32 s0, s5, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v7, s6 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: s_xor_b32 s4, s5, s6 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s0, v2 +; GFX9-NEXT: s_xor_b32 s0, s1, s4 +; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v8, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v8, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, s7, v6 -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 +; GFX9-NEXT: v_xor_b32_e32 v3, s0, v7 +; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 +; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v7, s1, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s1, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s0, s12, 31 -; GFX10-NEXT: s_ashr_i32 s1, s13, 31 -; GFX10-NEXT: s_ashr_i32 s2, s14, 31 -; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s4, s12, s0 -; GFX10-NEXT: s_add_i32 s5, s13, s1 -; GFX10-NEXT: s_add_i32 s12, s14, s2 -; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s4, s0 -; GFX10-NEXT: s_xor_b32 s15, s5, s1 -; GFX10-NEXT: s_xor_b32 s12, s12, s2 +; GFX10-NEXT: s_ashr_i32 s10, s4, 31 +; GFX10-NEXT: s_ashr_i32 s11, s5, 31 +; GFX10-NEXT: s_ashr_i32 s12, s6, 31 +; GFX10-NEXT: s_ashr_i32 s13, s7, 31 +; GFX10-NEXT: s_add_i32 s4, s4, s10 +; GFX10-NEXT: s_add_i32 s5, s5, s11 +; GFX10-NEXT: s_add_i32 s6, s6, s12 +; GFX10-NEXT: s_add_i32 s7, s7, s13 +; GFX10-NEXT: s_xor_b32 s14, s4, s10 +; GFX10-NEXT: s_xor_b32 s15, s5, s11 +; GFX10-NEXT: s_xor_b32 s16, s6, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX10-NEXT: s_xor_b32 s13, s13, s3 +; GFX10-NEXT: s_xor_b32 s17, s7, s13 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s16 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s17 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_sub_i32 s4, 0, s14 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: s_sub_i32 s5, 0, s15 -; GFX10-NEXT: s_sub_i32 s19, 0, s12 -; GFX10-NEXT: s_ashr_i32 s16, s8, 31 -; GFX10-NEXT: s_ashr_i32 s17, s9, 31 -; GFX10-NEXT: s_ashr_i32 s18, s10, 31 +; GFX10-NEXT: s_sub_i32 s6, 0, s16 +; GFX10-NEXT: s_ashr_i32 s18, s0, 31 +; GFX10-NEXT: s_ashr_i32 s19, s1, 31 +; GFX10-NEXT: s_ashr_i32 s20, s2, 31 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_xor_b32 s20, s16, s0 +; GFX10-NEXT: s_ashr_i32 s21, s3, 31 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b32 s21, s17, s1 +; GFX10-NEXT: s_add_i32 s0, s0, s18 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s13 +; GFX10-NEXT: s_sub_i32 s4, 0, s17 ; GFX10-NEXT: v_mul_lo_u32 v5, s5, v1 -; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 +; GFX10-NEXT: v_mul_lo_u32 v6, s6, v2 ; GFX10-NEXT: v_mul_lo_u32 v7, s4, v3 -; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s4, s8, s16 -; GFX10-NEXT: s_add_i32 s5, s9, s17 +; GFX10-NEXT: s_add_i32 s1, s1, s19 +; GFX10-NEXT: s_add_i32 s2, s2, s20 +; GFX10-NEXT: s_add_i32 s3, s3, s21 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 -; GFX10-NEXT: s_add_i32 s8, s10, s18 +; GFX10-NEXT: s_xor_b32 s0, s0, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s4, s16 -; GFX10-NEXT: s_xor_b32 s11, s5, s17 +; GFX10-NEXT: s_xor_b32 s1, s1, s19 +; GFX10-NEXT: s_xor_b32 s2, s2, s20 +; GFX10-NEXT: s_xor_b32 s3, s3, s21 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 -; GFX10-NEXT: s_xor_b32 s8, s8, s18 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 -; GFX10-NEXT: s_xor_b32 s9, s9, s19 -; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: s_xor_b32 s8, s18, s10 +; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX10-NEXT: v_mul_hi_u32 v2, s2, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX10-NEXT: s_xor_b32 s9, s19, s11 +; GFX10-NEXT: s_xor_b32 s10, s20, s12 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 -; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 -; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 +; GFX10-NEXT: v_mul_lo_u32 v6, v2, s16 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s17 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, s11, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s8, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s0, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s2, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, s3, v7 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s17, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s16, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s17, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 @@ -1225,38 +1225,38 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s17, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s16, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s17, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 -; GFX10-NEXT: s_xor_b32 s0, s19, s3 +; GFX10-NEXT: s_xor_b32 s0, s21, s13 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 -; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s22, v2 +; GFX10-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s9, v1 +; GFX10-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX10-NEXT: v_xor_b32_e32 v4, s16, v4 -; GFX10-NEXT: v_xor_b32_e32 v5, s17, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s18, v6 -; GFX10-NEXT: v_xor_b32_e32 v7, s19, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s20, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s21, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s22, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, s18, v4 +; GFX10-NEXT: v_xor_b32_e32 v5, s19, v5 +; GFX10-NEXT: v_xor_b32_e32 v6, s20, v6 +; GFX10-NEXT: v_xor_b32_e32 v7, s21, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s8, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s9, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s10, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s16, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s17, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s18, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s19, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s18, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s19, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s20, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s21, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] @@ -1271,26 +1271,26 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s4, s13, 31 +; GFX8-NEXT: s_ashr_i32 s4, s17, 31 ; GFX8-NEXT: s_ashr_i32 s6, s1, 31 -; GFX8-NEXT: s_add_u32 s16, s12, s4 -; GFX8-NEXT: s_addc_u32 s17, s13, s4 +; GFX8-NEXT: s_add_u32 s10, s16, s4 +; GFX8-NEXT: s_addc_u32 s11, s17, s4 ; GFX8-NEXT: s_add_u32 s0, s0, s6 ; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 -; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] +; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[4:5] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s18, 0, s12 -; GFX8-NEXT: s_subb_u32 s19, 0, s13 +; GFX8-NEXT: s_sub_u32 s16, 0, s8 +; GFX8-NEXT: s_subb_u32 s17, 0, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1298,10 +1298,10 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 @@ -1324,15 +1324,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_xor_b64 s[18:19], s[4:5], s[6:7] -; GFX8-NEXT: s_ashr_i32 s6, s15, 31 +; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] +; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc @@ -1353,72 +1353,72 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s17, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s16, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, s16, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s17, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s17, v1 +; GFX8-NEXT: v_mul_lo_u32 v4, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, s10, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v4, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s17 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s16, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NEXT: s_ashr_i32 s16, s3, 31 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_ashr_i32 s10, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s17, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s12, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s14, s6 -; GFX8-NEXT: s_addc_u32 s1, s15, s6 -; GFX8-NEXT: s_add_u32 s2, s2, s16 -; GFX8-NEXT: s_mov_b32 s17, s16 -; GFX8-NEXT: s_addc_u32 s3, s3, s16 -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX8-NEXT: s_add_u32 s0, s18, s6 +; GFX8-NEXT: s_addc_u32 s1, s19, s6 +; GFX8-NEXT: s_add_u32 s2, s2, s10 +; GFX8-NEXT: s_mov_b32 s11, s10 +; GFX8-NEXT: s_addc_u32 s3, s3, s10 +; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v8 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc -; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v11, v1 @@ -1435,15 +1435,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s5, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], s20, v12, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s19, v10 +; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -1464,11 +1464,11 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s18, v4 +; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s19 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s18, v1 +; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 @@ -1497,37 +1497,37 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s13, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s12, v3 +; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s12, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s13, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s13, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s12, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s13, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s13 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s13, v6 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 @@ -1560,7 +1560,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 @@ -1571,37 +1571,37 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mov_b32_e32 v8, s6 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s13, 31 +; GFX9-NEXT: s_ashr_i32 s4, s17, 31 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_u32 s16, s12, s4 -; GFX9-NEXT: s_addc_u32 s17, s13, s4 +; GFX9-NEXT: s_add_u32 s10, s16, s4 +; GFX9-NEXT: s_addc_u32 s11, s17, s4 ; GFX9-NEXT: s_add_u32 s0, s0, s6 ; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: s_addc_u32 s1, s1, s6 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[4:5] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s18, 0, s12 -; GFX9-NEXT: s_subb_u32 s19, 0, s13 +; GFX9-NEXT: s_sub_u32 s16, 0, s8 +; GFX9-NEXT: s_subb_u32 s17, 0, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1609,10 +1609,10 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 @@ -1634,16 +1634,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_xor_b64 s[18:19], s[4:5], s[6:7] -; GFX9-NEXT: s_ashr_i32 s6, s15, 31 +; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] +; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc @@ -1663,68 +1663,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s11, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v4, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s17 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s16, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v5, v[2:3] -; GFX9-NEXT: s_ashr_i32 s16, s3, 31 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[2:3] +; GFX9-NEXT: s_ashr_i32 s10, s3, 31 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v1, s17, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX9-NEXT: v_sub_u32_e32 v1, s11, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s12, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v2, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s14, s6 -; GFX9-NEXT: s_addc_u32 s1, s15, s6 -; GFX9-NEXT: s_add_u32 s2, s2, s16 -; GFX9-NEXT: s_mov_b32 s17, s16 -; GFX9-NEXT: s_addc_u32 s3, s3, s16 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX9-NEXT: s_add_u32 s0, s18, s6 +; GFX9-NEXT: s_addc_u32 s1, s19, s6 +; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: s_addc_u32 s3, s3, s10 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v16, s2 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s12, v10 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v17 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v4, v12, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[14:15], s20, v18, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s20, v18, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v3, v14, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v16, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 @@ -1768,17 +1768,17 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v18, v1 ; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[14:15], s5, v11, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s5, v11, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2] -; GFX9-NEXT: v_xor_b32_e32 v8, s18, v5 -; GFX9-NEXT: v_xor_b32_e32 v9, s19, v9 +; GFX9-NEXT: v_xor_b32_e32 v8, s16, v5 +; GFX9-NEXT: v_xor_b32_e32 v9, s17, v9 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s19 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s18, v8 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s16, v8 ; GFX9-NEXT: v_xor_b32_e32 v5, s4, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v12, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, v11, v4 @@ -1803,18 +1803,18 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s13, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s12, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, s12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s13, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s13, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s8, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s9, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v4 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, s13, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, s9, v4 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, s12, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, s8, v4 ; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -1829,13 +1829,13 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc ; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s13 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s12, v3 +; GFX9-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v3 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s13, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] @@ -1867,7 +1867,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] ; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10 ; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9 ; GFX9-NEXT: v_mov_b32_e32 v9, s1 @@ -1878,45 +1878,45 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[8:9] -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[10:11] +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s16, s1, 31 -; GFX10-NEXT: s_ashr_i32 s4, s13, 31 -; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_add_u32 s12, s12, s4 -; GFX10-NEXT: s_addc_u32 s13, s13, s4 -; GFX10-NEXT: s_add_u32 s0, s0, s16 -; GFX10-NEXT: s_addc_u32 s1, s1, s16 +; GFX10-NEXT: s_ashr_i32 s8, s1, 31 +; GFX10-NEXT: s_ashr_i32 s4, s17, 31 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_add_u32 s10, s16, s4 +; GFX10-NEXT: s_addc_u32 s11, s17, s4 +; GFX10-NEXT: s_add_u32 s0, s0, s8 +; GFX10-NEXT: s_addc_u32 s1, s1, s8 ; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] -; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] +; GFX10-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX10-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX10-NEXT: s_sub_u32 s21, 0, s6 ; GFX10-NEXT: s_subb_u32 s20, 0, s7 -; GFX10-NEXT: s_ashr_i32 s12, s15, 31 -; GFX10-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] -; GFX10-NEXT: s_ashr_i32 s16, s3, 31 -; GFX10-NEXT: s_add_u32 s14, s14, s12 -; GFX10-NEXT: s_addc_u32 s15, s15, s12 +; GFX10-NEXT: s_xor_b64 s[16:17], s[4:5], s[8:9] +; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: s_ashr_i32 s10, s3, 31 +; GFX10-NEXT: s_add_u32 s18, s18, s8 +; GFX10-NEXT: s_addc_u32 s19, s19, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_add_u32 s2, s2, s16 -; GFX10-NEXT: s_mov_b32 s17, s16 -; GFX10-NEXT: s_addc_u32 s3, s3, s16 -; GFX10-NEXT: s_mov_b32 s13, s12 -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] +; GFX10-NEXT: s_add_u32 s2, s2, s10 +; GFX10-NEXT: s_mov_b32 s11, s10 +; GFX10-NEXT: s_addc_u32 s3, s3, s10 +; GFX10-NEXT: s_mov_b32 s9, s8 +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX10-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] +; GFX10-NEXT: s_xor_b64 s[18:19], s[18:19], s[8:9] ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 @@ -2052,17 +2052,17 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7 -; GFX10-NEXT: v_mul_lo_u32 v0, s15, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, s14, v8 +; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 +; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4 -; GFX10-NEXT: v_mul_hi_u32 v9, s14, v2 +; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s15, v8 +; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 ; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12 -; GFX10-NEXT: v_mul_hi_u32 v13, s14, v8 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1 @@ -2076,7 +2076,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 ; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_mul_hi_u32 v5, s15, v8 +; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2] @@ -2119,25 +2119,25 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s14, v2 +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s15, v0, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s15, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8 -; GFX10-NEXT: v_xor_b32_e32 v1, s18, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, s19, v4 +; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 ; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s18 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v4, s0 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0 @@ -2163,19 +2163,19 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] +; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 ; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6 ; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v7, s12, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s12, v8 +; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2 +; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s12 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v8, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[8:9] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[10:11] +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 @@ -2187,55 +2187,55 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_xor_b32 s6, s0, s5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 ; GFX8-NEXT: s_sext_i32_i8 s4, s4 +; GFX8-NEXT: s_ashr_i32 s7, s4, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_add_i32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s5, s7, s5 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x80008 @@ -2245,21 +2245,21 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 @@ -2271,8 +2271,8 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] ; GFX9-NEXT: global_store_byte v2, v1, s[2:3] @@ -2280,19 +2280,19 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: sdiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s8, s0, 31 +; GFX10-NEXT: s_ashr_i32 s6, s0, 31 ; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s8 +; GFX10-NEXT: s_add_i32 s0, s0, s6 ; GFX10-NEXT: s_xor_b32 s5, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s8 +; GFX10-NEXT: s_xor_b32 s0, s0, s6 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: s_xor_b32 s4, s6, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2303,7 +2303,7 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -2315,9 +2315,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s8, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s6, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-NEXT: global_store_byte v2, v1, s[2:3] @@ -2332,46 +2332,46 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s2, s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_xor_b32 s8, s0, s3 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s4, 0, s8 +; GFX8-NEXT: s_xor_b32 s10, s0, s3 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX8-NEXT: s_sub_i32 s4, 0, s10 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80018 -; GFX8-NEXT: s_ashr_i32 s10, s1, 31 +; GFX8-NEXT: s_ashr_i32 s12, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s1, s1, s10 -; GFX8-NEXT: s_xor_b32 s11, s1, s10 +; GFX8-NEXT: s_add_i32 s1, s1, s12 +; GFX8-NEXT: s_xor_b32 s13, s1, s12 ; GFX8-NEXT: s_sext_i32_i8 s0, s2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX8-NEXT: s_ashr_i32 s9, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 +; GFX8-NEXT: s_ashr_i32 s11, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s11 ; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX8-NEXT: s_xor_b32 s0, s0, s9 +; GFX8-NEXT: s_xor_b32 s0, s0, s11 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s8 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s10 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s8, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s10, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s8, v2 -; GFX8-NEXT: s_sub_i32 s1, 0, s11 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s10, v2 +; GFX8-NEXT: s_sub_i32 s1, 0, s13 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80008 @@ -2379,25 +2379,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s0, s9, s3 +; GFX8-NEXT: s_xor_b32 s0, s11, s3 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s9, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s11, v2 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s13 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s11, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s13, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 -; GFX8-NEXT: s_xor_b32 s0, s2, s10 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s13, v3 +; GFX8-NEXT: s_xor_b32 s0, s2, s12 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 @@ -2420,70 +2420,70 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80010 ; GFX9-NEXT: s_ashr_i32 s5, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s5 -; GFX9-NEXT: s_xor_b32 s8, s0, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x80018 -; GFX9-NEXT: s_ashr_i32 s9, s7, 31 +; GFX9-NEXT: s_xor_b32 s6, s0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_bfe_i32 s8, s4, 0x80018 +; GFX9-NEXT: s_ashr_i32 s9, s8, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s7, s7, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_xor_b32 s8, s8, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s10, 0, s8 +; GFX9-NEXT: s_sub_i32 s10, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i8 s6, s4 +; GFX9-NEXT: s_sext_i32_i8 s7, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 -; GFX9-NEXT: s_sub_i32 s11, 0, s7 +; GFX9-NEXT: s_add_i32 s7, s7, s10 +; GFX9-NEXT: s_xor_b32 s7, s7, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s8 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 ; GFX9-NEXT: s_bfe_i32 s4, s4, 0x80008 ; GFX9-NEXT: s_ashr_i32 s11, s4, 31 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX9-NEXT: s_add_i32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: s_xor_b32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: s_xor_b32 s5, s10, s5 ; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 @@ -2505,16 +2505,16 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 ; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 ; GFX10-NEXT: s_ashr_i32 s2, s1, 31 -; GFX10-NEXT: s_ashr_i32 s8, s3, 31 +; GFX10-NEXT: s_ashr_i32 s10, s3, 31 ; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s3, s3, s8 +; GFX10-NEXT: s_add_i32 s3, s3, s10 ; GFX10-NEXT: s_xor_b32 s1, s1, s2 -; GFX10-NEXT: s_xor_b32 s3, s3, s8 +; GFX10-NEXT: s_xor_b32 s3, s3, s10 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX10-NEXT: s_sub_i32 s4, 0, s1 @@ -2529,14 +2529,14 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX10-NEXT: s_bfe_i32 s4, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 -; GFX10-NEXT: s_ashr_i32 s10, s0, 31 +; GFX10-NEXT: s_ashr_i32 s11, s4, 31 +; GFX10-NEXT: s_ashr_i32 s12, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 -; GFX10-NEXT: s_add_i32 s0, s0, s10 +; GFX10-NEXT: s_add_i32 s4, s4, s11 +; GFX10-NEXT: s_add_i32 s0, s0, s12 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 -; GFX10-NEXT: s_xor_b32 s0, s0, s10 +; GFX10-NEXT: s_xor_b32 s4, s4, s11 +; GFX10-NEXT: s_xor_b32 s0, s0, s12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -2546,7 +2546,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 @@ -2564,20 +2564,20 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: s_xor_b32 s1, s9, s2 +; GFX10-NEXT: s_xor_b32 s1, s11, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 -; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 +; GFX10-NEXT: s_xor_b32 s0, s12, s10 ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 ; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2596,55 +2596,55 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_xor_b32 s6, s0, s5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_ashr_i32 s7, s4, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_add_i32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s5, s7, s5 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x100010 @@ -2654,21 +2654,21 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 @@ -2680,8 +2680,8 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v2, v0, s[0:1] ; GFX9-NEXT: global_store_short v2, v1, s[2:3] @@ -2689,19 +2689,19 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: sdiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x100010 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s8, s0, 31 +; GFX10-NEXT: s_ashr_i32 s6, s0, 31 ; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s8 +; GFX10-NEXT: s_add_i32 s0, s0, s6 ; GFX10-NEXT: s_xor_b32 s5, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s8 +; GFX10-NEXT: s_xor_b32 s0, s0, s6 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: s_xor_b32 s4, s6, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2712,7 +2712,7 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -2724,9 +2724,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s8, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s6, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-NEXT: global_store_short v2, v1, s[2:3] @@ -2741,46 +2741,46 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 -; GFX8-NEXT: s_ashr_i32 s8, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s9, s0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: s_sub_i32 s4, 0, s9 +; GFX8-NEXT: s_ashr_i32 s10, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s10 +; GFX8-NEXT: s_xor_b32 s11, s0, s10 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 +; GFX8-NEXT: s_sub_i32 s4, 0, s11 ; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 -; GFX8-NEXT: s_ashr_i32 s10, s1, 31 +; GFX8-NEXT: s_ashr_i32 s12, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s1, s1, s10 -; GFX8-NEXT: s_xor_b32 s11, s1, s10 +; GFX8-NEXT: s_add_i32 s1, s1, s12 +; GFX8-NEXT: s_xor_b32 s13, s1, s12 ; GFX8-NEXT: s_sext_i32_i16 s0, s2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s9 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s11 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s11, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2 -; GFX8-NEXT: s_sub_i32 s1, 0, s11 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s11, v2 +; GFX8-NEXT: s_sub_i32 s1, 0, s13 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x100010 @@ -2788,25 +2788,25 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s0, s3, s8 +; GFX8-NEXT: s_xor_b32 s0, s3, s10 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, s3, v2 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s13 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s3, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s13, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 -; GFX8-NEXT: s_xor_b32 s0, s2, s10 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s13, v3 +; GFX8-NEXT: s_xor_b32 s0, s2, s12 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 @@ -2829,58 +2829,58 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s0, s5 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX9-NEXT: s_bfe_i32 s5, s5, 0x100010 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_ashr_i32 s7, s5, 31 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: s_xor_b32 s5, s5, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_xor_b32 s5, s5, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s10, 0, s9 +; GFX9-NEXT: s_sub_i32 s10, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s6, s4 +; GFX9-NEXT: s_sext_i32_i16 s8, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s8, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_add_i32 s8, s8, s10 +; GFX9-NEXT: s_xor_b32 s8, s8, s10 ; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 ; GFX9-NEXT: s_bfe_i32 s4, s4, 0x100010 ; GFX9-NEXT: s_ashr_i32 s11, s4, 31 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX9-NEXT: s_add_i32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: s_xor_b32 s4, s4, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s6, s10, s8 +; GFX9-NEXT: s_xor_b32 s6, s10, s6 ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 @@ -2891,7 +2891,7 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 -; GFX9-NEXT: s_xor_b32 s4, s11, s7 +; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2912,16 +2912,16 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s1 ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 ; GFX10-NEXT: s_ashr_i32 s3, s2, 31 -; GFX10-NEXT: s_ashr_i32 s8, s1, 31 +; GFX10-NEXT: s_ashr_i32 s10, s1, 31 ; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_add_i32 s1, s1, s8 +; GFX10-NEXT: s_add_i32 s1, s1, s10 ; GFX10-NEXT: s_xor_b32 s2, s2, s3 -; GFX10-NEXT: s_xor_b32 s1, s1, s8 +; GFX10-NEXT: s_xor_b32 s1, s1, s10 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s4, 0, s2 @@ -2936,14 +2936,14 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX10-NEXT: s_sext_i32_i16 s4, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 -; GFX10-NEXT: s_ashr_i32 s10, s0, 31 +; GFX10-NEXT: s_ashr_i32 s11, s4, 31 +; GFX10-NEXT: s_ashr_i32 s12, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 -; GFX10-NEXT: s_add_i32 s0, s0, s10 +; GFX10-NEXT: s_add_i32 s4, s4, s11 +; GFX10-NEXT: s_add_i32 s0, s0, s12 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 -; GFX10-NEXT: s_xor_b32 s0, s0, s10 +; GFX10-NEXT: s_xor_b32 s4, s4, s11 +; GFX10-NEXT: s_xor_b32 s0, s0, s12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -2953,7 +2953,7 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -2971,19 +2971,19 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: s_xor_b32 s1, s9, s3 +; GFX10-NEXT: s_xor_b32 s1, s11, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 -; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 -; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 +; GFX10-NEXT: s_xor_b32 s0, s12, s10 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3002,47 +3002,47 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_xor_b32 s6, s0, s5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 ; GFX8-NEXT: s_bfe_i32 s4, s4, 0x30000 +; GFX8-NEXT: s_ashr_i32 s7, s4, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_add_i32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s5, s7, s5 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 @@ -3052,7 +3052,7 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: sdivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x30008 @@ -3062,21 +3062,21 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 @@ -3088,8 +3088,8 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: sdivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x30008 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x30000 @@ -3129,7 +3129,7 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -3153,47 +3153,47 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_xor_b32 s6, s0, s5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 ; GFX8-NEXT: s_bfe_i32 s4, s4, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s7, s4, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_add_i32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s4, s4, s7 +; GFX8-NEXT: s_xor_b32 s5, s7, s5 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 @@ -3203,7 +3203,7 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x1b0000 @@ -3213,21 +3213,21 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: s_sub_i32 s1, 0, s5 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x1b0000 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 @@ -3239,8 +3239,8 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -3250,7 +3250,7 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x1b0000 @@ -3280,7 +3280,7 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index adbe92fdbc6252..be8cb232931766 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -238,7 +238,7 @@ define i64 @v_shl_i64_sext_i32_overflow(i32 %x) { define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX7-LABEL: mulu24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -251,7 +251,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX8-LABEL: mulu24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: mulu24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -281,7 +281,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX10-LABEL: mulu24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 @@ -296,7 +296,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX11-LABEL: mulu24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 6, v0 ; GFX11-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] @@ -319,7 +319,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GFX7-LABEL: muli24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -338,7 +338,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX8-LABEL: muli24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -361,7 +361,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX9-LABEL: muli24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -376,7 +376,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX10-LABEL: muli24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -391,7 +391,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX11-LABEL: muli24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll index 8ea5fc25d95de7..9ee0acf2aa2db6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -92,8 +92,8 @@ entry: ; GCN-LABEL: {{^}}smrd6: ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 ; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 -; GFX9_10: s_add_u32 s0, s6, -4 -; GFX9_10: s_addc_u32 s1, s7, -1 +; GFX9_10: s_add_u32 s2, s2, -4 +; GFX9_10: s_addc_u32 s3, s3, -1 ; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 2d85081f5fc969..e81bae5d3a416b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -10,54 +10,54 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: ds_write_b128 v4, v[0:3] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out @@ -67,38 +67,38 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX9-NEXT: s_lshr_b32 s4, s4, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s6 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshr_b32 s0, s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 @@ -106,11 +106,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s0, s7, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s0, s3, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 @@ -123,51 +123,51 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s1, s4, 16 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_lshr_b32 s5, s0, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX7-NEXT: s_lshr_b32 s0, s4, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_lshr_b32 s0, s5, 16 -; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX7-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: s_lshr_b32 s1, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshr_b32 s0, s6, 16 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s0, s2, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX7-NEXT: s_lshr_b32 s1, s6, 24 +; GFX7-NEXT: s_lshr_b32 s1, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX7-NEXT: s_bfe_u32 s1, s7, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: s_lshr_b32 s0, s7, 16 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_lshr_b32 s0, s3, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:13 -; GFX7-NEXT: s_lshr_b32 s1, s7, 24 +; GFX7-NEXT: s_lshr_b32 s1, s3, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 @@ -177,30 +177,30 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s2, s2, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s4, s6, 16 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s1, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: s_lshr_b32 s4, s6, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: s_lshr_b32 s0, s7, 8 +; GFX10-NEXT: v_mov_b32_e32 v7, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 @@ -208,18 +208,18 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: s_lshr_b32 s0, s1, 8 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s7 -; GFX10-NEXT: s_lshr_b32 s1, s7, 16 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 +; GFX10-NEXT: s_lshr_b32 s1, s3, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_lshr_b32 s0, s1, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 @@ -234,26 +234,26 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s1, s4, 16 -; GFX11-NEXT: s_lshr_b32 s2, s2, 8 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s6, 16 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s0 +; GFX11-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_lshr_b32 s1, s2, 16 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s2 +; GFX11-NEXT: s_lshr_b32 s2, s6, 8 +; GFX11-NEXT: s_lshr_b32 s6, s5, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s1, s3, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s5, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_lshr_b32 s4, s4, 8 +; GFX11-NEXT: s_lshr_b32 s5, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s7, 8 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 ; GFX11-NEXT: ds_store_b8 v1, v0 ; GFX11-NEXT: ds_store_b8 v1, v6 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 @@ -262,15 +262,15 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: ds_store_b8 v1, v8 offset:5 ; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 ; GFX11-NEXT: ds_store_b8 v1, v9 offset:7 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7 -; GFX11-NEXT: s_lshr_b32 s0, s4, 8 -; GFX11-NEXT: s_lshr_b32 s1, s7, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: s_lshr_b32 s0, s1, 8 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-NEXT: v_mov_b32_e32 v4, s0 -; GFX11-NEXT: s_and_b32 s0, 0xffff, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s3 +; GFX11-NEXT: s_lshr_b32 s1, s3, 16 ; GFX11-NEXT: s_lshr_b32 s0, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 ; GFX11-NEXT: s_lshr_b32 s0, s1, 8 ; GFX11-NEXT: v_mov_b32_e32 v8, s0 ; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 @@ -289,27 +289,27 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: s_lshr_b32 s0, s6, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX9-NEXT: s_lshr_b32 s0, s7, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s0, s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:14 @@ -317,28 +317,28 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s1, s4, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s5, s0, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: ds_write_b16 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX7-NEXT: s_lshr_b32 s0, s5, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX7-NEXT: s_lshr_b32 s0, s6, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX7-NEXT: s_lshr_b32 s0, s7, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: s_lshr_b32 s0, s3, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:14 @@ -347,22 +347,22 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 -; GFX10-NEXT: s_lshr_b32 s2, s6, 16 -; GFX10-NEXT: s_lshr_b32 s3, s7, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-NEXT: s_lshr_b32 s2, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: v_mov_b32_e32 v8, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 ; GFX10-NEXT: ds_write_b16 v1, v0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 @@ -376,18 +376,18 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s1, s4, 16 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s6, 16 -; GFX11-NEXT: s_lshr_b32 s3, s7, 16 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: v_mov_b32_e32 v8, s3 +; GFX11-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_lshr_b32 s1, s2, 16 +; GFX11-NEXT: s_lshr_b32 s2, s3, 16 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-NEXT: ds_store_b16 v1, v0 ; GFX11-NEXT: ds_store_b16 v1, v5 offset:2 ; GFX11-NEXT: ds_store_b16 v1, v2 offset:4 @@ -404,44 +404,44 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset0:2 offset1:3 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm @@ -449,12 +449,12 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: v_mov_b32_e32 v4, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v4, s3 ; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1 ; GFX11-NEXT: ds_store_2addr_b32 v1, v3, v4 offset0:2 offset1:3 ; GFX11-NEXT: s_endpgm @@ -465,42 +465,42 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm @@ -508,12 +508,12 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 8 @@ -523,54 +523,54 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: ds_write_b128 v4, v[0:3] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 4ef79b752c4373..030f01a8bd5eae 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -10,50 +10,54 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: ds_store_b96 v3, v[0:2] ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out @@ -63,38 +67,40 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX9-NEXT: s_lshr_b32 s3, s3, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 ; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s6 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshr_b32 s0, s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 @@ -107,40 +113,41 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s1, s4, 16 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s4, s0, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX7-NEXT: s_lshr_b32 s0, s4, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_lshr_b32 s0, s5, 16 -; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX7-NEXT: s_bfe_u32 s3, s1, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: s_lshr_b32 s1, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshr_b32 s0, s6, 16 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s0, s2, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX7-NEXT: s_lshr_b32 s1, s6, 24 +; GFX7-NEXT: s_lshr_b32 s1, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 @@ -149,33 +156,34 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s4, s6, 16 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s1, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s2, s2, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_lshr_b32 s4, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: s_lshr_b32 s0, s6, 8 +; GFX10-NEXT: v_mov_b32_e32 v9, s4 +; GFX10-NEXT: s_lshr_b32 s3, s3, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v10, s0 -; GFX10-NEXT: s_lshr_b32 s0, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 +; GFX10-NEXT: s_lshr_b32 s0, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v7, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 @@ -183,7 +191,7 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 @@ -194,29 +202,29 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s1, s4, 16 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_lshr_b32 s4, s6, 16 -; GFX11-NEXT: s_lshr_b32 s2, s2, 8 -; GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s1, s3, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s5, 8 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s0 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_lshr_b32 s1, s2, 16 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX11-NEXT: s_lshr_b32 s2, s5, 8 ; GFX11-NEXT: s_lshr_b32 s5, s4, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v12, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_lshr_b32 s3, s3, 8 +; GFX11-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s6, 8 +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, s6 ; GFX11-NEXT: ds_store_b8 v1, v0 ; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 @@ -237,22 +245,24 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: s_lshr_b32 s0, s6, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 @@ -260,23 +270,24 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_lshr_b32 s4, s0, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s1, s4, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: ds_write_b16 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX7-NEXT: s_lshr_b32 s0, s5, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX7-NEXT: s_lshr_b32 s0, s6, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 @@ -284,20 +295,21 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s6, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: ds_write_b16 v1, v0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 @@ -308,17 +320,17 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s1, s4, 16 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: s_lshr_b32 s2, s6, 16 -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: s_lshr_b32 s1, s2, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s1 ; GFX11-NEXT: ds_store_b16 v1, v0 ; GFX11-NEXT: ds_store_b16 v1, v3 offset:2 ; GFX11-NEXT: ds_store_b16 v1, v2 offset:4 @@ -333,53 +345,57 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX9-NEXT: ds_write_b32 v1, v3 offset:8 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b32 v1, v0 offset:8 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write_b32 v1, v3 offset:8 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1 ; GFX11-NEXT: ds_store_b32 v1, v3 offset:8 ; GFX11-NEXT: s_endpgm @@ -390,53 +406,57 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX9-NEXT: ds_write_b32 v1, v3 offset:8 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b32 v1, v0 offset:8 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write_b32 v1, v3 offset:8 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1 ; GFX11-NEXT: ds_store_b32 v1, v3 offset:8 ; GFX11-NEXT: s_endpgm @@ -447,50 +467,54 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: ds_store_b96 v3, v[0:2] ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 5aef6679347094..ffebde52df4a3e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX8-NEXT: s_sub_i32 s0, 0, s5 @@ -14,7 +14,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -41,7 +41,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -50,7 +50,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -73,7 +73,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX10-NEXT: s_sub_i32 s0, 0, s5 @@ -81,7 +81,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -112,7 +112,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -251,12 +251,12 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 -; GFX9-NEXT: s_sub_u32 s2, 0, s14 -; GFX9-NEXT: s_subb_u32 s3, 0, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s19 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s18 +; GFX9-NEXT: s_sub_u32 s2, 0, s18 +; GFX9-NEXT: s_subb_u32 s3, 0, s19 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -293,7 +293,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] @@ -318,52 +318,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v6, s17 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v0 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6 -; GFX9-NEXT: v_sub_u32_e32 v0, s13, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v6 +; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s18, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s18, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc @@ -378,17 +378,17 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s14 -; GFX10-NEXT: s_sub_u32 s0, 0, s14 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s19 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s18 +; GFX10-NEXT: s_sub_u32 s0, 0, s18 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2] -; GFX10-NEXT: s_subb_u32 s1, 0, s15 +; GFX10-NEXT: s_subb_u32 s1, 0, s19 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 @@ -449,14 +449,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX10-NEXT: v_mul_lo_u32 v5, s13, v1 +; GFX10-NEXT: v_mul_hi_u32 v4, s16, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX10-NEXT: v_mul_lo_u32 v5, s17, v1 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 -; GFX10-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX10-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -466,38 +466,38 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s13, v1 +; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s14, v5, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v5, 0 ; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2] ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s12, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s13, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v7 +; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s14 +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s19, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v9 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v8 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0 -; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s14 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s18 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo @@ -509,8 +509,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v9, s0 -; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[8:9] -; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[10:11] +; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[12:13] +; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv i64 %x, %y store i64 %div, ptr addrspace(1) %out0 @@ -522,7 +522,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -576,12 +576,12 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX9-NEXT: s_sub_i32 s0, 0, s14 -; GFX9-NEXT: s_sub_i32 s1, 0, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s18 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s19 +; GFX9-NEXT: s_sub_i32 s0, 0, s18 +; GFX9-NEXT: s_sub_i32 s1, 0, s19 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14 +; GFX9-NEXT: v_mul_hi_u32 v1, s17, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s18 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s19 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s16, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s18, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s17, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s18, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s19, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s18, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s18, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s19, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s19, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: s_sub_i32 s0, 0, s14 -; GFX10-NEXT: s_sub_i32 s1, 0, s15 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s18 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19 +; GFX10-NEXT: s_sub_i32 s0, 0, s18 +; GFX10-NEXT: s_sub_i32 s1, 0, s19 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15 +; GFX10-NEXT: v_mul_hi_u32 v0, s16, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s17, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s18 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s19 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s16, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s17, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s18, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s19, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s18, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s18, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s19, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9] -; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11] +; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 @@ -685,13 +685,13 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX8-NEXT: s_sub_i32 s0, 0, s12 -; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s14 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s17 +; GFX8-NEXT: s_sub_i32 s0, 0, s16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s18 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -699,78 +699,78 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s13 +; GFX8-NEXT: s_sub_i32 s0, 0, s17 ; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s12 +; GFX8-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s16 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v1, s13 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 +; GFX8-NEXT: v_mul_lo_u32 v5, v1, s17 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s12, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s12, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v6 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s9, v5 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s13, v5 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s17, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 -; GFX8-NEXT: s_sub_i32 s0, 0, s14 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s17, v2 +; GFX8-NEXT: s_sub_i32 s0, 0, s18 ; GFX8-NEXT: v_mul_lo_u32 v6, s0, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s17, v2 ; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s15 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s19 ; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v3, s10, v3 -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s14, v3 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s17, v2 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX8-NEXT: s_sub_i32 s0, 0, s15 +; GFX8-NEXT: s_sub_i32 s0, 0, s19 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, v3, s14 +; GFX8-NEXT: v_mul_lo_u32 v2, v3, s18 ; GFX8-NEXT: v_mul_lo_u32 v7, s0, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v3 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s14, v2 ; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s18, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s14, v2 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s18, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v2, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, s11, v2 +; GFX8-NEXT: v_mul_hi_u32 v7, s15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v8 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s18, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v7, s15 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s14, v8 +; GFX8-NEXT: v_mul_lo_u32 v3, v7, s19 +; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s18, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s11, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s15, v3 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s19, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s15, v3 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s19, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s15, v8 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s19, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s15, v8 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s19, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 @@ -783,116 +783,115 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_sub_i32 s0, 0, s12 -; GFX9-NEXT: s_sub_i32 s1, 0, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_sub_i32 s10, 0, s4 +; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX9-NEXT: s_sub_i32 s12, 0, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s11, v1 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s12 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s13 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s0, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s1, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s15 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v7 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_subrev_u32_e32 v6, s13, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 -; GFX9-NEXT: v_mul_hi_u32 v2, s10, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, s2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v6 -; GFX9-NEXT: s_sub_i32 s4, 0, s15 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v6 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v2, s14 -; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v6 +; GFX9-NEXT: v_mul_lo_u32 v7, v2, s6 +; GFX9-NEXT: v_mul_lo_u32 v8, s0, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, s10, v7 +; GFX9-NEXT: v_sub_u32_e32 v6, s2, v7 ; GFX9-NEXT: v_mul_hi_u32 v7, v3, v8 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v7, s6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, s15 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v3, s7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s6, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s3, v8 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 +; GFX9-NEXT: v_subrev_u32_e32 v8, s7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 +; GFX9-NEXT: v_subrev_u32_e32 v8, s7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s14 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s15 -; GFX10-NEXT: s_sub_i32 s0, 0, s12 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX10-NEXT: s_sub_i32 s10, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s1, 0, s13 -; GFX10-NEXT: s_sub_i32 s2, 0, s14 +; GFX10-NEXT: s_sub_i32 s11, 0, s5 +; GFX10-NEXT: s_sub_i32 s12, 0, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -901,11 +900,12 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 -; GFX10-NEXT: v_mul_lo_u32 v6, s2, v2 -; GFX10-NEXT: s_sub_i32 s0, 0, s15 -; GFX10-NEXT: v_mul_lo_u32 v7, s0, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s11, v1 +; GFX10-NEXT: v_mul_lo_u32 v6, s12, v2 +; GFX10-NEXT: s_sub_i32 s10, 0, s7 +; GFX10-NEXT: v_mul_lo_u32 v7, s10, v3 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 @@ -914,34 +914,34 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 -; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s10, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, v0, s12 -; GFX10-NEXT: v_mul_lo_u32 v5, v1, s13 -; GFX10-NEXT: v_mul_lo_u32 v6, v2, s14 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX10-NEXT: v_mul_hi_u32 v2, s2, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v5, v1, s5 +; GFX10-NEXT: v_mul_lo_u32 v6, v2, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v0 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s15 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s8, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, s9, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s10, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s11, v7 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s0, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s2, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, s3, v7 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s5, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s6, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s7, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s12, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s4, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s13, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s5, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s14, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s6, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s15, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s7, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1 @@ -949,25 +949,26 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v12, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s5, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s6, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s7, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s12, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s4, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s13, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s5, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s14, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s15, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s6, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s7, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s2 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[10:11] ; GFX10-NEXT: s_endpgm %div = udiv <4 x i32> %x, %y store <4 x i32> %div, ptr addrspace(1) %out0 @@ -979,8 +980,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1248,13 +1249,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s17 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s16 -; GFX9-NEXT: s_sub_u32 s2, 0, s16 -; GFX9-NEXT: s_subb_u32 s3, 0, s17 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX9-NEXT: s_sub_u32 s2, 0, s4 +; GFX9-NEXT: s_subb_u32 s3, 0, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1293,12 +1294,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: s_sub_u32 s2, 0, s18 +; GFX9-NEXT: s_sub_u32 s2, 0, s6 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_subb_u32 s3, 0, s19 +; GFX9-NEXT: s_subb_u32 s3, 0, s7 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 @@ -1317,47 +1318,47 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v9, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v8, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v1 ; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s17, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s19 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s18 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s16, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s4, v2 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc @@ -1369,13 +1370,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v12 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1] ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5] ; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13 @@ -1384,7 +1385,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 ; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s16, v11 +; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s4, v11 ; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 @@ -1440,55 +1441,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s15, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, s14, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, s19, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, s18, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v2, s14, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, s15, v5 +; GFX9-NEXT: v_mul_hi_u32 v2, s18, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, s19, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s15, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, s19, v6 ; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_hi_u32 v9, s14, v6 -; GFX9-NEXT: v_mul_hi_u32 v13, s15, v6 +; GFX9-NEXT: v_mul_hi_u32 v9, s18, v6 +; GFX9-NEXT: v_mul_hi_u32 v13, s19, v6 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s6, v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v1, v11, v9 ; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v10, s15 -; GFX9-NEXT: v_mov_b32_e32 v6, s19 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v5 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v9, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v12, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s18, v5 ; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, s15, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 +; GFX9-NEXT: v_sub_u32_e32 v1, s19, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s6, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 ; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v13 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v11 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s18, v11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v13 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s6, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc @@ -1503,24 +1504,24 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[8:9] -; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[10:11] +; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s17 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s16 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s18 -; GFX10-NEXT: s_sub_u32 s0, 0, s16 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX10-NEXT: s_sub_u32 s0, 0, s4 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_subb_u32 s1, 0, s17 +; GFX10-NEXT: s_subb_u32 s1, 0, s5 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1540,16 +1541,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v7, 0 -; GFX10-NEXT: s_sub_u32 s2, 0, s18 +; GFX10-NEXT: s_sub_u32 s2, 0, s6 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s3, s2, v8, 0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4] ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 -; GFX10-NEXT: s_subb_u32 s3, 0, s19 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5] +; GFX10-NEXT: s_subb_u32 s3, 0, s7 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s10, s1, v7, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s3, v8, v[5:6] ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 @@ -1561,38 +1562,38 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 +; GFX10-NEXT: v_add_co_u32 v6, s10, v6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v11, s10, v13, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v1, s10, v1, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v2, s10, v16, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v4, s10, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v6, s10, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v1, s10, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v4, s10, v6, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v2, v1 +; GFX10-NEXT: v_add_co_u32 v1, s10, v2, v1 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s0, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s10, s2, v8, 0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4] @@ -1641,20 +1642,20 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v8, v1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v3, s13, v4 -; GFX10-NEXT: v_mul_lo_u32 v8, s12, v2 -; GFX10-NEXT: v_mul_hi_u32 v5, s12, v4 -; GFX10-NEXT: v_mul_hi_u32 v4, s13, v4 -; GFX10-NEXT: v_mul_lo_u32 v9, s13, v2 -; GFX10-NEXT: v_mul_lo_u32 v6, s15, v1 -; GFX10-NEXT: v_mul_hi_u32 v10, s12, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, s13, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s14, v0 -; GFX10-NEXT: v_mul_hi_u32 v7, s14, v1 -; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1 -; GFX10-NEXT: v_mul_lo_u32 v12, s15, v0 -; GFX10-NEXT: v_mul_hi_u32 v13, s14, v0 -; GFX10-NEXT: v_mul_hi_u32 v14, s15, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s17, v4 +; GFX10-NEXT: v_mul_lo_u32 v8, s16, v2 +; GFX10-NEXT: v_mul_hi_u32 v5, s16, v4 +; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4 +; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 +; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 +; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 +; GFX10-NEXT: v_mul_hi_u32 v11, s17, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, s18, v0 +; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 +; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 +; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 +; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 @@ -1677,77 +1678,77 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s16, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s18, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 ; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 ; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2] ; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5] ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s12, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s16, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s13, v3 +; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6] +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s14, v2 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s15, v0 +; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s16 +; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s4 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v5 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v18 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v18 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v18 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s19, v16 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s16 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s18 +; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v16 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v12 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v12 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s18 +; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo @@ -1758,8 +1759,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[8:9] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[10:11] +; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 @@ -1771,7 +1772,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -1781,7 +1782,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -1808,7 +1809,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 @@ -1819,7 +1820,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 @@ -1842,7 +1843,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: udiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -1858,7 +1859,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -1883,8 +1884,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: udivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s0, s[8:9], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -1948,7 +1949,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80010 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 @@ -1964,26 +1965,26 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX9-NEXT: s_sub_i32 s2, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX9-NEXT: s_and_b32 s8, s0, 0xff +; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s6, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s5 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2011,7 +2012,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2029,7 +2030,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2080,7 +2081,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s5, s4, 16 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -2090,7 +2091,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -2117,7 +2118,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 @@ -2128,7 +2129,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 @@ -2151,7 +2152,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: udiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff @@ -2167,7 +2168,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -2192,8 +2193,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2257,7 +2258,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -2276,7 +2277,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2318,14 +2319,15 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2372,7 +2374,6 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] ; GFX10-NEXT: s_endpgm @@ -2386,7 +2387,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -2396,7 +2397,7 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -2425,7 +2426,7 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: udivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s4, s0, 0x30008 @@ -2436,7 +2437,7 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 @@ -2461,7 +2462,7 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: udivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s4, s0, 0x30008 ; GFX10-NEXT: s_and_b32 s0, s0, 7 @@ -2477,7 +2478,7 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -2504,7 +2505,7 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -2514,7 +2515,7 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -2543,7 +2544,7 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s1, 0x7ffffff @@ -2554,7 +2555,7 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 @@ -2579,7 +2580,7 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s1, 0x7ffffff ; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff @@ -2595,7 +2596,7 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 386e34f72ab734..9c2fabce4bcdeb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v4, 8 ; GFX906-NEXT: v_mov_b32_e32 v5, 16 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v3, v2, s[4:5] +; GFX906-NEXT: global_load_dword v3, v2, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v1, 0xff ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -18,17 +18,17 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v3, v6, v7, v3 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v0, v2, s[6:7] +; GFX906-NEXT: global_load_dword v0, v2, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v3, v2, v3, v0 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 @@ -38,8 +38,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: global_store_short v1, v0, s[0:1] -; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[0:1] offset:2 +; GFX906-NEXT: global_store_short v1, v0, s[6:7] +; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[6:7] offset:2 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -61,21 +61,21 @@ bb.2: define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, v2, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dword v1, v2, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v1, v2, s[6:7] +; GFX906-NEXT: global_load_dword v1, v2, s[2:3] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: global_store_dword v0, v1, s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -97,30 +97,30 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX906-NEXT: global_store_byte v4, v1, s[0:1] -; GFX906-NEXT: global_store_byte v4, v0, s[0:1] offset:1 -; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[0:1] offset:2 -; GFX906-NEXT: global_store_byte v4, v3, s[0:1] offset:3 -; GFX906-NEXT: global_store_byte v4, v2, s[0:1] offset:4 +; GFX906-NEXT: global_store_byte v4, v1, s[6:7] +; GFX906-NEXT: global_store_byte v4, v0, s[6:7] offset:1 +; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[6:7] offset:2 +; GFX906-NEXT: global_store_byte v4, v3, s[6:7] offset:3 +; GFX906-NEXT: global_store_byte v4, v2, s[6:7] offset:4 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -142,21 +142,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -178,21 +178,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[2:3] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -214,25 +214,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5] -; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[0:1] +; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[0:1] offset:16 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7] -; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[2:3] +; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[2:3] offset:16 ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] offset:16 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -254,16 +254,16 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] ; GFX906-NEXT: s_mov_b32 s14, -1 ; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 +; GFX906-NEXT: s_add_u32 s12, s12, s11 ; GFX906-NEXT: s_addc_u32 s13, s13, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) @@ -272,49 +272,49 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[4:5] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[4:5] offset:64 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[4:5] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[4:5] offset:96 -; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[4:5] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[4:5] offset:128 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[4:5] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[4:5] offset:160 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[4:5] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[4:5] offset:192 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[0:1] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[0:1] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[0:1] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[0:1] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[0:1] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[0:1] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[6:7] offset:64 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[6:7] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[6:7] offset:96 -; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[6:7] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[6:7] offset:128 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[6:7] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[6:7] offset:160 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[6:7] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[6:7] offset:192 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[6:7] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[2:3] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[2:3] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[2:3] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[2:3] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[2:3] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[2:3] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[2:3] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[2:3] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[2:3] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[2:3] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[2:3] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[2:3] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[2:3] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[2:3] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:240 ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 @@ -383,28 +383,28 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] -; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:16 -; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:32 -; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:48 -; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:64 -; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:80 -; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:96 -; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 -; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:128 -; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:144 -; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:160 -; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:176 -; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:192 -; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:208 -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:224 +; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] +; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:16 +; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:32 +; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:48 +; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:64 +; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:80 +; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:96 +; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112 +; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:128 +; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:144 +; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:160 +; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:176 +; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:192 +; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:208 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:224 ; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -427,26 +427,26 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX906-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_cmp_lt_i32 s0, 3 +; GFX906-NEXT: s_cmp_lt_i32 s6, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX906-NEXT: ; %bb.1: ; %LeafBlock -; GFX906-NEXT: s_cmp_ge_i32 s0, 1 +; GFX906-NEXT: s_cmp_ge_i32 s6, 1 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.2: ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX906-NEXT: global_load_dword v0, v0, s[4:5] +; GFX906-NEXT: global_load_dword v0, v0, s[0:1] ; GFX906-NEXT: s_branch .LBB7_5 ; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 -; GFX906-NEXT: s_cmp_eq_u32 s0, 3 +; GFX906-NEXT: s_cmp_eq_u32 s6, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.4: ; %sw.bb5 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX906-NEXT: global_load_dword v0, v0, s[6:7] +; GFX906-NEXT: global_load_dword v0, v0, s[2:3] ; GFX906-NEXT: .LBB7_5: ; %return.sink.split -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -479,16 +479,16 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9] ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB8_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc @@ -500,12 +500,12 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13] ; GFX906-NEXT: .LBB8_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -533,31 +533,31 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB9_4 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB9_3 ; GFX906-NEXT: ; %bb.2: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] ; GFX906-NEXT: .LBB9_3: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: .LBB9_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -584,7 +584,7 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xff @@ -599,16 +599,16 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: .LBB10_1: ; %bb.1 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc +; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX906-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1 ; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_cbranch_execnz .LBB10_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index f36dcb487e915f..9cd85553eb7b61 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i8_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -18,24 +18,24 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX9-LABEL: constant_load_i8_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_byte v1, v0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i8_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %ld = load i8, ptr addrspace(4) %in, align 4 store i8 %ld, ptr addrspace(1) %out, align 4 @@ -45,7 +45,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i16_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -57,24 +57,24 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr ; ; GFX9-LABEL: constant_load_i16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_short v1, v0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_short v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_short v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %ld = load i16, ptr addrspace(4) %in, align 4 store i16 %ld, ptr addrspace(1) %out, align 4 @@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -97,26 +97,26 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_sext_i32_i8 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_sext_i32_i8 s2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 4 %sext = sext i8 %load to i32 @@ -127,7 +127,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -140,26 +140,26 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_sext_i32_i16 s2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %in, align 4 %sext = sext i16 %load to i32 @@ -170,7 +170,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -183,26 +183,26 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 4 %zext = zext i8 %load to i32 @@ -213,7 +213,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -226,26 +226,26 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %in, align 4 %zext = zext i16 %load to i32 @@ -256,7 +256,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -269,22 +269,22 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: constant_load_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 2 store i8 %load, ptr addrspace(1) %out, align 2 @@ -294,7 +294,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i16_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -307,22 +307,22 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: constant_load_i16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_load_i16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %in, align 2 store i16 %load, ptr addrspace(1) %out, align 2 @@ -332,7 +332,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_sextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -351,24 +351,24 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_sextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_sbyte v1, v0, s[6:7] +; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v1, s[4:5] -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_sextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_sbyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v0, v1, s[4:5] -; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 2 %sextload = sext i8 %load to i32 @@ -379,7 +379,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_zextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -398,24 +398,24 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_zextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v1, s[4:5] -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: constant_zextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v0, v1, s[4:5] -; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2 ; GFX10-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %in, align 2 %zextload = zext i8 %load to i32 diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index 3c9d43a88a0fda..033af692438015 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: s_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: s_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -35,31 +35,31 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: s_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_add_i32 s2, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_add_i32 s2, s4, s5 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -71,7 +71,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: s_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -91,7 +91,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -106,7 +106,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -121,35 +121,35 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_add_i32 s2, s5, s7 +; GFX9-NEXT: s_add_i32 s3, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_add_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_add_i32 s2, s4, s6 +; GFX10-NEXT: s_add_i32 s3, s5, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_add_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -163,7 +163,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -185,7 +185,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -204,7 +204,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -223,7 +223,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_add_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_add_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -276,7 +276,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -301,85 +301,85 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) { ; GFX6-LABEL: s_add_v8i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s23, 0xf000 -; GFX6-NEXT: s_mov_b32 s22, -1 +; GFX6-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s7, s15 -; GFX6-NEXT: s_add_i32 s1, s6, s14 -; GFX6-NEXT: s_add_i32 s2, s5, s13 -; GFX6-NEXT: s_add_i32 s3, s4, s12 ; GFX6-NEXT: s_add_i32 s4, s11, s19 ; GFX6-NEXT: s_add_i32 s5, s10, s18 ; GFX6-NEXT: s_add_i32 s6, s9, s17 ; GFX6-NEXT: s_add_i32 s7, s8, s16 +; GFX6-NEXT: s_add_i32 s8, s15, s23 +; GFX6-NEXT: s_add_i32 s9, s14, s22 +; GFX6-NEXT: s_add_i32 s10, s13, s21 +; GFX6-NEXT: s_add_i32 s11, s12, s20 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v8i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s7, s7, s15 -; GFX8-NEXT: s_add_i32 s6, s6, s14 -; GFX8-NEXT: s_add_i32 s5, s5, s13 -; GFX8-NEXT: s_add_i32 s4, s4, s12 -; GFX8-NEXT: s_add_i32 s2, s11, s19 -; GFX8-NEXT: s_add_i32 s3, s10, s18 -; GFX8-NEXT: s_add_i32 s9, s9, s17 -; GFX8-NEXT: s_add_i32 s8, s8, s16 +; GFX8-NEXT: s_add_i32 s4, s11, s19 +; GFX8-NEXT: s_add_i32 s5, s10, s18 +; GFX8-NEXT: s_add_i32 s6, s9, s17 +; GFX8-NEXT: s_add_i32 s7, s8, s16 +; GFX8-NEXT: s_add_i32 s2, s15, s23 +; GFX8-NEXT: s_add_i32 s3, s14, s22 +; GFX8-NEXT: s_add_i32 s8, s13, s21 +; GFX8-NEXT: s_add_i32 s9, s12, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_add_v8i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s7, s15 -; GFX9-NEXT: s_add_i32 s3, s6, s14 -; GFX9-NEXT: s_add_i32 s6, s11, s19 -; GFX9-NEXT: s_add_i32 s7, s10, s18 -; GFX9-NEXT: s_add_i32 s9, s9, s17 -; GFX9-NEXT: s_add_i32 s8, s8, s16 -; GFX9-NEXT: s_add_i32 s5, s5, s13 -; GFX9-NEXT: s_add_i32 s4, s4, s12 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_add_i32 s4, s9, s17 +; GFX9-NEXT: s_add_i32 s5, s8, s16 +; GFX9-NEXT: s_add_i32 s6, s15, s23 +; GFX9-NEXT: s_add_i32 s7, s14, s22 +; GFX9-NEXT: s_add_i32 s8, s13, s21 +; GFX9-NEXT: s_add_i32 s9, s12, s20 +; GFX9-NEXT: s_add_i32 s2, s11, s19 +; GFX9-NEXT: s_add_i32 s3, s10, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] @@ -388,24 +388,24 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX10-LABEL: s_add_v8i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s2, s7, s15 -; GFX10-NEXT: s_add_i32 s3, s6, s14 -; GFX10-NEXT: s_add_i32 s6, s11, s19 -; GFX10-NEXT: s_add_i32 s7, s10, s18 -; GFX10-NEXT: s_add_i32 s8, s8, s16 -; GFX10-NEXT: s_add_i32 s9, s9, s17 -; GFX10-NEXT: s_add_i32 s5, s5, s13 -; GFX10-NEXT: s_add_i32 s4, s4, s12 +; GFX10-NEXT: s_add_i32 s4, s9, s17 +; GFX10-NEXT: s_add_i32 s5, s8, s16 +; GFX10-NEXT: s_add_i32 s6, s15, s23 +; GFX10-NEXT: s_add_i32 s7, s14, s22 +; GFX10-NEXT: s_add_i32 s8, s12, s20 +; GFX10-NEXT: s_add_i32 s9, s13, s21 +; GFX10-NEXT: s_add_i32 s2, s11, s19 +; GFX10-NEXT: s_add_i32 s3, s10, s18 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 @@ -415,21 +415,21 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX11-LABEL: s_add_v8i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s2, s7, s15 -; GFX11-NEXT: s_add_i32 s3, s6, s14 -; GFX11-NEXT: s_add_i32 s6, s11, s19 -; GFX11-NEXT: s_add_i32 s7, s10, s18 -; GFX11-NEXT: s_add_i32 s8, s8, s16 -; GFX11-NEXT: s_add_i32 s9, s9, s17 -; GFX11-NEXT: s_add_i32 s5, s5, s13 -; GFX11-NEXT: s_add_i32 s4, s4, s12 +; GFX11-NEXT: s_add_i32 s4, s9, s17 +; GFX11-NEXT: s_add_i32 s5, s8, s16 +; GFX11-NEXT: s_add_i32 s6, s15, s23 +; GFX11-NEXT: s_add_i32 s7, s14, s22 +; GFX11-NEXT: s_add_i32 s8, s12, s20 +; GFX11-NEXT: s_add_i32 s9, s13, s21 +; GFX11-NEXT: s_add_i32 s2, s11, s19 +; GFX11-NEXT: s_add_i32 s3, s10, s18 ; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2 ; GFX11-NEXT: v_mov_b32_e32 v6, s3 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 @@ -439,21 +439,21 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX12-LABEL: s_add_v8i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s2, s7, s15 -; GFX12-NEXT: s_add_co_i32 s3, s6, s14 -; GFX12-NEXT: s_add_co_i32 s6, s11, s19 -; GFX12-NEXT: s_add_co_i32 s7, s10, s18 -; GFX12-NEXT: s_add_co_i32 s8, s8, s16 -; GFX12-NEXT: s_add_co_i32 s9, s9, s17 -; GFX12-NEXT: s_add_co_i32 s5, s5, s13 -; GFX12-NEXT: s_add_co_i32 s4, s4, s12 +; GFX12-NEXT: s_add_co_i32 s4, s9, s17 +; GFX12-NEXT: s_add_co_i32 s5, s8, s16 +; GFX12-NEXT: s_add_co_i32 s6, s15, s23 +; GFX12-NEXT: s_add_co_i32 s7, s14, s22 +; GFX12-NEXT: s_add_co_i32 s8, s12, s20 +; GFX12-NEXT: s_add_co_i32 s9, s13, s21 +; GFX12-NEXT: s_add_co_i32 s2, s11, s19 +; GFX12-NEXT: s_add_co_i32 s3, s10, s18 ; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6 -; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4 +; GFX12-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2 ; GFX12-NEXT: v_mov_b32_e32 v6, s3 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 @@ -468,75 +468,75 @@ entry: define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) { ; GFX6-LABEL: s_add_v16i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GFX6-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 -; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s23, 0xf000 -; GFX6-NEXT: s_mov_b32 s22, -1 +; GFX6-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; GFX6-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s7, s39 -; GFX6-NEXT: s_add_i32 s1, s6, s38 -; GFX6-NEXT: s_add_i32 s2, s5, s37 -; GFX6-NEXT: s_add_i32 s3, s4, s36 -; GFX6-NEXT: s_add_i32 s4, s11, s43 -; GFX6-NEXT: s_add_i32 s5, s10, s42 -; GFX6-NEXT: s_add_i32 s6, s9, s41 -; GFX6-NEXT: s_add_i32 s7, s8, s40 -; GFX6-NEXT: s_add_i32 s8, s15, s47 -; GFX6-NEXT: s_add_i32 s9, s14, s46 -; GFX6-NEXT: s_add_i32 s10, s13, s45 -; GFX6-NEXT: s_add_i32 s11, s12, s44 -; GFX6-NEXT: s_add_i32 s12, s19, s51 -; GFX6-NEXT: s_add_i32 s13, s18, s50 -; GFX6-NEXT: s_add_i32 s14, s17, s49 -; GFX6-NEXT: s_add_i32 s15, s16, s48 +; GFX6-NEXT: s_add_i32 s4, s11, s39 +; GFX6-NEXT: s_add_i32 s5, s10, s38 +; GFX6-NEXT: s_add_i32 s6, s9, s37 +; GFX6-NEXT: s_add_i32 s7, s8, s36 +; GFX6-NEXT: s_add_i32 s8, s15, s43 +; GFX6-NEXT: s_add_i32 s9, s14, s42 +; GFX6-NEXT: s_add_i32 s10, s13, s41 +; GFX6-NEXT: s_add_i32 s11, s12, s40 +; GFX6-NEXT: s_add_i32 s12, s19, s47 +; GFX6-NEXT: s_add_i32 s13, s18, s46 +; GFX6-NEXT: s_add_i32 s14, s17, s45 +; GFX6-NEXT: s_add_i32 s15, s16, s44 +; GFX6-NEXT: s_add_i32 s16, s23, s51 +; GFX6-NEXT: s_add_i32 s17, s22, s50 +; GFX6-NEXT: s_add_i32 s18, s21, s49 +; GFX6-NEXT: s_add_i32 s19, s20, s48 +; GFX6-NEXT: v_mov_b32_e32 v0, s19 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NEXT: v_mov_b32_e32 v1, s14 ; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mov_b32_e32 v3, s12 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NEXT: v_mov_b32_e32 v1, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s9 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v16i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX8-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX8-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s7, s7, s39 -; GFX8-NEXT: s_add_i32 s6, s6, s38 -; GFX8-NEXT: s_add_i32 s5, s5, s37 -; GFX8-NEXT: s_add_i32 s4, s4, s36 -; GFX8-NEXT: s_add_i32 s11, s11, s43 -; GFX8-NEXT: s_add_i32 s10, s10, s42 -; GFX8-NEXT: s_add_i32 s9, s9, s41 -; GFX8-NEXT: s_add_i32 s8, s8, s40 -; GFX8-NEXT: s_add_i32 s15, s15, s47 -; GFX8-NEXT: s_add_i32 s14, s14, s46 -; GFX8-NEXT: s_add_i32 s13, s13, s45 -; GFX8-NEXT: s_add_i32 s12, s12, s44 -; GFX8-NEXT: s_add_i32 s2, s19, s51 -; GFX8-NEXT: s_add_i32 s3, s18, s50 -; GFX8-NEXT: s_add_i32 s17, s17, s49 -; GFX8-NEXT: s_add_i32 s16, s16, s48 +; GFX8-NEXT: s_add_i32 s4, s11, s39 +; GFX8-NEXT: s_add_i32 s5, s10, s38 +; GFX8-NEXT: s_add_i32 s6, s9, s37 +; GFX8-NEXT: s_add_i32 s7, s8, s36 +; GFX8-NEXT: s_add_i32 s8, s15, s43 +; GFX8-NEXT: s_add_i32 s9, s14, s42 +; GFX8-NEXT: s_add_i32 s10, s13, s41 +; GFX8-NEXT: s_add_i32 s11, s12, s40 +; GFX8-NEXT: s_add_i32 s12, s19, s47 +; GFX8-NEXT: s_add_i32 s13, s18, s46 +; GFX8-NEXT: s_add_i32 s14, s17, s45 +; GFX8-NEXT: s_add_i32 s15, s16, s44 +; GFX8-NEXT: s_add_i32 s2, s23, s51 +; GFX8-NEXT: s_add_i32 s3, s22, s50 +; GFX8-NEXT: s_add_i32 s16, s21, s49 +; GFX8-NEXT: s_add_i32 s17, s20, s48 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 @@ -544,77 +544,78 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v3, s12 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_add_v16i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s7, s39 -; GFX9-NEXT: s_add_i32 s3, s6, s38 -; GFX9-NEXT: s_add_i32 s6, s11, s43 -; GFX9-NEXT: s_add_i32 s7, s10, s42 -; GFX9-NEXT: s_add_i32 s10, s15, s47 -; GFX9-NEXT: s_add_i32 s11, s14, s46 -; GFX9-NEXT: s_add_i32 s14, s19, s51 -; GFX9-NEXT: s_add_i32 s15, s18, s50 -; GFX9-NEXT: s_add_i32 s17, s17, s49 -; GFX9-NEXT: s_add_i32 s16, s16, s48 -; GFX9-NEXT: s_add_i32 s13, s13, s45 -; GFX9-NEXT: s_add_i32 s12, s12, s44 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_add_i32 s4, s9, s37 +; GFX9-NEXT: s_add_i32 s5, s8, s36 +; GFX9-NEXT: s_add_i32 s6, s15, s43 +; GFX9-NEXT: s_add_i32 s7, s14, s42 +; GFX9-NEXT: s_add_i32 s8, s13, s41 +; GFX9-NEXT: s_add_i32 s9, s12, s40 +; GFX9-NEXT: s_add_i32 s12, s17, s45 +; GFX9-NEXT: s_add_i32 s13, s16, s44 +; GFX9-NEXT: s_add_i32 s14, s23, s51 +; GFX9-NEXT: s_add_i32 s15, s22, s50 +; GFX9-NEXT: s_add_i32 s16, s21, s49 +; GFX9-NEXT: s_add_i32 s17, s20, s48 +; GFX9-NEXT: s_add_i32 s2, s11, s39 +; GFX9-NEXT: s_add_i32 s3, s10, s38 +; GFX9-NEXT: s_add_i32 s10, s19, s47 +; GFX9-NEXT: s_add_i32 s11, s18, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_mov_b32_e32 v2, s15 ; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: s_add_i32 s9, s9, s41 -; GFX9-NEXT: s_add_i32 s8, s8, s40 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 -; GFX9-NEXT: s_add_i32 s5, s5, s37 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: s_add_i32 s4, s4, s36 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] @@ -623,41 +624,41 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX10-LABEL: s_add_v16i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s2, s7, s39 -; GFX10-NEXT: s_add_i32 s3, s6, s38 -; GFX10-NEXT: s_add_i32 s6, s11, s43 -; GFX10-NEXT: s_add_i32 s7, s10, s42 -; GFX10-NEXT: s_add_i32 s10, s15, s47 -; GFX10-NEXT: s_add_i32 s11, s14, s46 -; GFX10-NEXT: s_add_i32 s14, s19, s51 -; GFX10-NEXT: s_add_i32 s15, s18, s50 -; GFX10-NEXT: s_add_i32 s16, s16, s48 -; GFX10-NEXT: s_add_i32 s17, s17, s49 -; GFX10-NEXT: s_add_i32 s13, s13, s45 -; GFX10-NEXT: s_add_i32 s12, s12, s44 -; GFX10-NEXT: s_add_i32 s9, s9, s41 -; GFX10-NEXT: s_add_i32 s8, s8, s40 +; GFX10-NEXT: s_add_i32 s4, s9, s37 +; GFX10-NEXT: s_add_i32 s5, s8, s36 +; GFX10-NEXT: s_add_i32 s6, s15, s43 +; GFX10-NEXT: s_add_i32 s7, s14, s42 +; GFX10-NEXT: s_add_i32 s8, s13, s41 +; GFX10-NEXT: s_add_i32 s9, s12, s40 +; GFX10-NEXT: s_add_i32 s12, s17, s45 +; GFX10-NEXT: s_add_i32 s13, s16, s44 +; GFX10-NEXT: s_add_i32 s14, s23, s51 +; GFX10-NEXT: s_add_i32 s15, s22, s50 +; GFX10-NEXT: s_add_i32 s16, s20, s48 +; GFX10-NEXT: s_add_i32 s17, s21, s49 +; GFX10-NEXT: s_add_i32 s2, s11, s39 +; GFX10-NEXT: s_add_i32 s3, s10, s38 +; GFX10-NEXT: s_add_i32 s10, s19, s47 +; GFX10-NEXT: s_add_i32 s11, s18, s46 ; GFX10-NEXT: v_mov_b32_e32 v0, s16 ; GFX10-NEXT: v_mov_b32_e32 v1, s17 ; GFX10-NEXT: v_mov_b32_e32 v2, s15 ; GFX10-NEXT: v_mov_b32_e32 v3, s14 -; GFX10-NEXT: s_add_i32 s5, s5, s37 -; GFX10-NEXT: s_add_i32 s4, s4, s36 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v4, s13 +; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: v_mov_b32_e32 v7, s10 -; GFX10-NEXT: v_mov_b32_e32 v8, s8 -; GFX10-NEXT: v_mov_b32_e32 v9, s9 +; GFX10-NEXT: v_mov_b32_e32 v8, s9 +; GFX10-NEXT: v_mov_b32_e32 v9, s8 ; GFX10-NEXT: v_mov_b32_e32 v10, s7 ; GFX10-NEXT: v_mov_b32_e32 v11, s6 -; GFX10-NEXT: v_mov_b32_e32 v12, s4 -; GFX10-NEXT: v_mov_b32_e32 v13, s5 +; GFX10-NEXT: v_mov_b32_e32 v12, s5 +; GFX10-NEXT: v_mov_b32_e32 v13, s4 ; GFX10-NEXT: v_mov_b32_e32 v14, s3 ; GFX10-NEXT: v_mov_b32_e32 v15, s2 ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48 @@ -669,34 +670,34 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX11-LABEL: s_add_v16i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s2, s7, s39 -; GFX11-NEXT: s_add_i32 s3, s6, s38 -; GFX11-NEXT: s_add_i32 s6, s11, s43 -; GFX11-NEXT: s_add_i32 s7, s10, s42 -; GFX11-NEXT: s_add_i32 s10, s15, s47 -; GFX11-NEXT: s_add_i32 s11, s14, s46 -; GFX11-NEXT: s_add_i32 s14, s19, s51 -; GFX11-NEXT: s_add_i32 s15, s18, s50 -; GFX11-NEXT: s_add_i32 s16, s16, s48 -; GFX11-NEXT: s_add_i32 s17, s17, s49 -; GFX11-NEXT: s_add_i32 s13, s13, s45 -; GFX11-NEXT: s_add_i32 s12, s12, s44 +; GFX11-NEXT: s_add_i32 s4, s9, s37 +; GFX11-NEXT: s_add_i32 s5, s8, s36 +; GFX11-NEXT: s_add_i32 s6, s15, s43 +; GFX11-NEXT: s_add_i32 s7, s14, s42 +; GFX11-NEXT: s_add_i32 s8, s13, s41 +; GFX11-NEXT: s_add_i32 s9, s12, s40 +; GFX11-NEXT: s_add_i32 s12, s17, s45 +; GFX11-NEXT: s_add_i32 s13, s16, s44 +; GFX11-NEXT: s_add_i32 s14, s23, s51 +; GFX11-NEXT: s_add_i32 s15, s22, s50 +; GFX11-NEXT: s_add_i32 s16, s20, s48 +; GFX11-NEXT: s_add_i32 s17, s21, s49 +; GFX11-NEXT: s_add_i32 s2, s11, s39 +; GFX11-NEXT: s_add_i32 s3, s10, s38 +; GFX11-NEXT: s_add_i32 s10, s19, s47 +; GFX11-NEXT: s_add_i32 s11, s18, s46 ; GFX11-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17 -; GFX11-NEXT: s_add_i32 s9, s9, s41 -; GFX11-NEXT: s_add_i32 s8, s8, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14 -; GFX11-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: s_add_i32 s5, s5, s37 -; GFX11-NEXT: s_add_i32 s4, s4, s36 -; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10 -; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9 -; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6 -; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5 -; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12 +; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10 +; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8 +; GFX11-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6 +; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4 +; GFX11-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2 ; GFX11-NEXT: v_mov_b32_e32 v14, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 @@ -708,34 +709,34 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX12-LABEL: s_add_v16i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b512 s[8:23], s[4:5], 0x64 +; GFX12-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s2, s7, s39 -; GFX12-NEXT: s_add_co_i32 s3, s6, s38 -; GFX12-NEXT: s_add_co_i32 s6, s11, s43 -; GFX12-NEXT: s_add_co_i32 s7, s10, s42 -; GFX12-NEXT: s_add_co_i32 s10, s15, s47 -; GFX12-NEXT: s_add_co_i32 s11, s14, s46 -; GFX12-NEXT: s_add_co_i32 s14, s19, s51 -; GFX12-NEXT: s_add_co_i32 s15, s18, s50 -; GFX12-NEXT: s_add_co_i32 s16, s16, s48 -; GFX12-NEXT: s_add_co_i32 s17, s17, s49 -; GFX12-NEXT: s_add_co_i32 s13, s13, s45 -; GFX12-NEXT: s_add_co_i32 s12, s12, s44 +; GFX12-NEXT: s_add_co_i32 s4, s9, s37 +; GFX12-NEXT: s_add_co_i32 s5, s8, s36 +; GFX12-NEXT: s_add_co_i32 s6, s15, s43 +; GFX12-NEXT: s_add_co_i32 s7, s14, s42 +; GFX12-NEXT: s_add_co_i32 s8, s13, s41 +; GFX12-NEXT: s_add_co_i32 s9, s12, s40 +; GFX12-NEXT: s_add_co_i32 s12, s17, s45 +; GFX12-NEXT: s_add_co_i32 s13, s16, s44 +; GFX12-NEXT: s_add_co_i32 s14, s23, s51 +; GFX12-NEXT: s_add_co_i32 s15, s22, s50 +; GFX12-NEXT: s_add_co_i32 s16, s20, s48 +; GFX12-NEXT: s_add_co_i32 s17, s21, s49 +; GFX12-NEXT: s_add_co_i32 s2, s11, s39 +; GFX12-NEXT: s_add_co_i32 s3, s10, s38 +; GFX12-NEXT: s_add_co_i32 s10, s19, s47 +; GFX12-NEXT: s_add_co_i32 s11, s18, s46 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17 -; GFX12-NEXT: s_add_co_i32 s9, s9, s41 -; GFX12-NEXT: s_add_co_i32 s8, s8, s40 ; GFX12-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14 -; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s13 -; GFX12-NEXT: s_add_co_i32 s5, s5, s37 -; GFX12-NEXT: s_add_co_i32 s4, s4, s36 -; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10 -; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9 -; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6 -; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5 -; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12 +; GFX12-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10 +; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8 +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6 +; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4 +; GFX12-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2 ; GFX12-NEXT: v_mov_b32_e32 v14, s3 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 @@ -752,7 +753,7 @@ entry: define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -773,7 +774,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: v_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -793,35 +794,35 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -837,7 +838,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -863,7 +864,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -882,7 +883,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_add_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -898,31 +899,31 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_add_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_add_imm_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_add_imm_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -935,7 +936,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_add_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -957,84 +958,84 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: add64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_add_u32 s4, s6, s8 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_addc_u32 s5, s7, s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_add_u32 s0, s2, s8 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_addc_u32 s1, s3, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_add_u32 s0, s6, s0 -; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s3, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_add_u32 s2, s2, s6 +; GFX9-NEXT: s_addc_u32 s3, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s6, s0 -; GFX10-NEXT: s_addc_u32 s1, s7, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_add_u32 s2, s2, s6 +; GFX10-NEXT: s_addc_u32 s3, s3, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: add64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s0, s6, s0 -; GFX11-NEXT: s_addc_u32 s1, s7, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_add_u32 s2, s2, s4 +; GFX11-NEXT: s_addc_u32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: add64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: %add = add i64 %a, %b @@ -1049,14 +1050,14 @@ entry: define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) { ; GFX6-LABEL: add64_sgpr_vgpr: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_u32 s0, s2, s8 ; GFX6-NEXT: s_addc_u32 s1, s3, s9 @@ -1067,10 +1068,10 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX8-LABEL: add64_sgpr_vgpr: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1083,42 +1084,42 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX9-LABEL: add64_sgpr_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add64_sgpr_vgpr: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s6, s0 -; GFX10-NEXT: s_addc_u32 s1, s7, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_add_u32 s2, s2, s4 +; GFX10-NEXT: s_addc_u32 s3, s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: add64_sgpr_vgpr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 @@ -1130,10 +1131,10 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX12-LABEL: add64_sgpr_vgpr: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1151,7 +1152,7 @@ entry: define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; GFX6-LABEL: add64_in_branch: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -1178,7 +1179,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add64_in_branch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -1204,24 +1205,24 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_add_u32 s0, s8, s10 -; GFX9-NEXT: s_addc_u32 s1, s9, s11 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccnz .LBB9_3 ; GFX9-NEXT: .LBB9_2: ; %if -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 ; GFX9-NEXT: .LBB9_3: ; %endif ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1 @@ -1229,22 +1230,22 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: add64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX10-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_add_u32 s0, s8, s10 -; GFX10-NEXT: s_addc_u32 s1, s9, s11 +; GFX10-NEXT: s_add_u32 s0, s12, s14 +; GFX10-NEXT: s_addc_u32 s1, s13, s15 ; GFX10-NEXT: s_cbranch_execnz .LBB9_3 ; GFX10-NEXT: .LBB9_2: ; %if -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 ; GFX10-NEXT: .LBB9_3: ; %endif ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB9_4: ; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1 @@ -1252,7 +1253,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: add64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 @@ -1274,7 +1275,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: add64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB9_4 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index b413e779dbaf19..f94ec392ee55cf 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -9,22 +9,22 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u16_e32 v3, v4, v2 ; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -33,51 +33,51 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 -; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -93,65 +93,65 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; VI-LABEL: s_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_add_i32 s2, s2, s0 -; VI-NEXT: s_add_i32 s1, s1, s3 -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: s_add_i32 s2, s2, s3 +; VI-NEXT: s_add_i32 s0, s0, s1 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_pk_add_u16 v1, s5, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_add_u16 v1, s4, s5 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v1, s2, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 %b = load <2 x i16>, ptr addrspace(4) %in1 @@ -163,7 +163,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; VI-LABEL: s_test_add_self_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -182,29 +182,29 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v1, s0, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_pk_add_u16 v1, s2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s0, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_add_u16 v1, s2, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -222,7 +222,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -239,26 +239,26 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_pk_add_u16 v1, s6, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s6, s7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 @@ -273,7 +273,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -292,32 +292,32 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b +; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -340,7 +340,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -359,32 +359,32 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -406,7 +406,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -425,31 +425,31 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, -1 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, -1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -471,7 +471,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -489,31 +489,31 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 32 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 32 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -536,7 +536,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -554,31 +554,31 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -601,22 +601,22 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v3, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u16_e32 v2, v4, v3 ; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -624,58 +624,58 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid @@ -693,23 +693,23 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v6, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: v_add_u16_e32 v0, v6, v2 ; VI-NEXT: v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -718,60 +718,60 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: global_store_b128 v1, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid @@ -789,22 +789,22 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_u16_e32 v2, v4, v2 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -814,58 +814,58 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid @@ -883,20 +883,20 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_u16_e32 v0, v0, v1 @@ -909,13 +909,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -923,20 +923,20 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -944,22 +944,22 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -969,7 +969,7 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 4d26453e1a0d6d..823db84a053b8c 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -244,7 +244,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: no_agpr_no_reserve: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 @@ -302,7 +302,7 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: no_agpr_no_reserve: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -514,15 +514,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX908-NEXT: s_load_dword s9, s[6:7], 0x18 -; GFX908-NEXT: s_mov_b32 s8, 0 -; GFX908-NEXT: s_mov_b32 s7, s8 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX908-NEXT: s_load_dword s7, s[8:9], 0x18 +; GFX908-NEXT: s_mov_b32 s6, 0 +; GFX908-NEXT: s_mov_b32 s9, s6 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s6, 0, s3 -; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 +; GFX908-NEXT: s_sub_i32 s8, 0, s3 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s7 ; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 @@ -530,24 +530,24 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s6, s6, s10 -; GFX908-NEXT: s_mul_hi_u32 s6, s10, s6 -; GFX908-NEXT: s_add_i32 s10, s10, s6 -; GFX908-NEXT: s_mul_hi_u32 s6, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s6, s3 +; GFX908-NEXT: s_mul_i32 s8, s8, s10 +; GFX908-NEXT: s_mul_hi_u32 s8, s10, s8 +; GFX908-NEXT: s_add_i32 s10, s10, s8 +; GFX908-NEXT: s_mul_hi_u32 s8, s2, s10 +; GFX908-NEXT: s_mul_i32 s10, s8, s3 ; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s6, 1 +; GFX908-NEXT: s_add_i32 s11, s8, 1 ; GFX908-NEXT: s_sub_i32 s10, s2, s3 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s6, s11, s6 +; GFX908-NEXT: s_cselect_b32 s8, s11, s8 ; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s6, 1 +; GFX908-NEXT: s_add_i32 s10, s8, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s6, s10, s6 -; GFX908-NEXT: s_lshr_b32 s9, s9, 16 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 +; GFX908-NEXT: s_cselect_b32 s8, s10, s8 +; GFX908-NEXT: s_lshr_b32 s7, s7, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s7 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[8:9], 5 ; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 ; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -572,15 +572,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; GFX908-NEXT: s_mov_b32 s9, s8 +; GFX908-NEXT: s_mov_b32 s7, s6 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: v_mov_b32_e32 v4, s6 ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v8, s8 -; GFX908-NEXT: v_mov_b32_e32 v5, s9 -; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_mov_b32_e32 v9, s9 +; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v9, s7 +; GFX908-NEXT: v_mov_b32_e32 v5, s7 +; GFX908-NEXT: v_mov_b32_e32 v7, s7 +; GFX908-NEXT: v_mov_b32_e32 v8, s6 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] @@ -666,7 +666,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s4, s4, s6 +; GFX908-NEXT: s_add_u32 s4, s4, s8 ; GFX908-NEXT: s_addc_u32 s5, s5, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 @@ -678,39 +678,39 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX90A-NEXT: s_load_dword s9, s[6:7], 0x18 -; GFX90A-NEXT: s_mov_b32 s8, 0 -; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX90A-NEXT: s_load_dword s7, s[8:9], 0x18 +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s6, 0, s3 +; GFX90A-NEXT: s_sub_i32 s8, 0, s3 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s9 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v3 -; GFX90A-NEXT: s_mul_i32 s6, s6, s10 -; GFX90A-NEXT: s_mul_hi_u32 s6, s10, s6 -; GFX90A-NEXT: s_add_i32 s10, s10, s6 -; GFX90A-NEXT: s_mul_hi_u32 s6, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s6, s3 +; GFX90A-NEXT: s_mul_i32 s8, s8, s10 +; GFX90A-NEXT: s_mul_hi_u32 s8, s10, s8 +; GFX90A-NEXT: s_add_i32 s10, s10, s8 +; GFX90A-NEXT: s_mul_hi_u32 s8, s2, s10 +; GFX90A-NEXT: s_mul_i32 s10, s8, s3 ; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s6, 1 +; GFX90A-NEXT: s_add_i32 s11, s8, 1 ; GFX90A-NEXT: s_sub_i32 s10, s2, s3 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s6, s11, s6 +; GFX90A-NEXT: s_cselect_b32 s8, s11, s8 ; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s6, 1 +; GFX90A-NEXT: s_add_i32 s10, s8, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s6, s10, s6 -; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s9 +; GFX90A-NEXT: s_cselect_b32 s8, s10, s8 +; GFX90A-NEXT: s_lshr_b32 s7, s7, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s7 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[8:9], 5 ; GFX90A-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 ; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -735,12 +735,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] @@ -818,7 +818,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s4, s4, s6 +; GFX90A-NEXT: s_add_u32 s4, s4, s8 ; GFX90A-NEXT: s_addc_u32 s5, s5, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 0a461f9ee6c968..b6c0271e5f56f6 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) { ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll index 88203202a320da..c31b2ceed6688a 100644 --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -41,16 +41,16 @@ define void @test1() { define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX9-LABEL: test2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB2_2: ; %then @@ -58,16 +58,16 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX10-LABEL: test2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB2_2: ; %then @@ -75,12 +75,12 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX11-LABEL: test2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lt_i32 s0, 1 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index de318e7ae31a5b..598b4a5fcbd336 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -392,7 +392,7 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_movk_i32 s0, 0x80 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index d88719502d88fd..b7436aeb1d5302 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-NEXT: s_mul_i32 s1, s0, s7 -; GFX9-NEXT: s_sub_i32 s1, s6, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_sub_i32 s3, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -137,7 +137,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s7 -; GFX9-NEXT: s_sub_i32 s0, s6, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s4, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s1, s6, s7 -; GFX9-NEXT: s_abs_i32 s2, s6 -; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_abs_i32 s4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: s_xor_b32 s3, s2, s3 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s1, s1, 31 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 -; GFX9-NEXT: s_mul_i32 s6, s3, s0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s5, s4 ; GFX9-NEXT: s_sub_i32 s2, s2, s6 -; GFX9-NEXT: s_add_i32 s7, s3, 1 -; GFX9-NEXT: s_sub_i32 s6, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_sub_i32 s6, s2, s4 +; GFX9-NEXT: s_cmp_ge_u32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_add_i32 s6, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s0, s6, s3 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_add_i32 s6, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s2, s6, s5 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -359,7 +359,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 31 -; GFX9-NEXT: s_abs_i32 s2, s6 -; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 -; GFX9-NEXT: s_mul_i32 s3, s3, s0 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s0, s3, s2 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -452,16 +452,16 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: udiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -474,14 +474,14 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -523,37 +523,37 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: urem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s5, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_and_b32 s0, s4, 0xffff +; GFX6-NEXT: s_lshr_b32 s2, s6, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_and_b32 s0, s6, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -562,8 +562,8 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -600,16 +600,16 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: sdiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s5, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_sext_i32_i16 s4, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_ashr_i32 s4, s6, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -626,15 +626,15 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_ashr_i32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 @@ -683,56 +683,56 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: srem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s5, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_sext_i32_i16 s2, s4 +; GFX6-NEXT: s_ashr_i32 s7, s6, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GFX6-NEXT: s_sext_i32_i16 s2, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GFX6-NEXT: s_xor_b32 s2, s2, s5 +; GFX6-NEXT: s_xor_b32 s2, s2, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 -; GFX6-NEXT: s_or_b32 s6, s2, 1 +; GFX6-NEXT: s_or_b32 s4, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX6-NEXT: s_cselect_b32 s2, s6, 0 +; GFX6-NEXT: s_cselect_b32 s2, s4, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s7 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX9-NEXT: s_sext_i32_i16 s2, s4 +; GFX9-NEXT: s_ashr_i32 s7, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GFX9-NEXT: s_sext_i32_i16 s2, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_xor_b32 s2, s2, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i16 %x, %y @@ -764,14 +764,14 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: udiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -783,13 +783,13 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -829,14 +829,14 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: urem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 -; GFX6-NEXT: s_lshr_b32 s2, s4, 8 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX6-NEXT: s_lshr_b32 s2, s6, 8 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -845,19 +845,19 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, 8 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -865,8 +865,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i8 %x, %y @@ -902,16 +902,16 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: sdiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s5, s4, 0x80008 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_sext_i32_i8 s4, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_bfe_i32 s4, s6, 0x80008 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i8 s5, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -928,15 +928,15 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s3, s2, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: s_sext_i32_i8 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 @@ -985,58 +985,58 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: srem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GFX6-NEXT: s_bfe_i32 s2, s6, 0x80008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX6-NEXT: s_sext_i32_i8 s3, s4 +; GFX6-NEXT: s_sext_i32_i8 s3, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX6-NEXT: s_xor_b32 s2, s3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 -; GFX6-NEXT: s_lshr_b32 s5, s4, 8 -; GFX6-NEXT: s_or_b32 s6, s2, 1 +; GFX6-NEXT: s_lshr_b32 s4, s6, 8 +; GFX6-NEXT: s_or_b32 s5, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX6-NEXT: s_cselect_b32 s2, s6, 0 +; GFX6-NEXT: s_cselect_b32 s2, s5, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 +; GFX9-NEXT: s_bfe_i32 s2, s6, 0x80008 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 +; GFX9-NEXT: s_sext_i32_i8 s3, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_lshr_b32 s4, s6, 8 +; GFX9-NEXT: s_or_b32 s5, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i8 %x, %y @@ -1179,8 +1179,8 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1211,8 +1211,8 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s0, s12 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_sub_i32 s4, 0, s13 -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX6-NEXT: s_sub_i32 s2, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -1221,22 +1221,22 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_readfirstlane_b32 s4, v1 -; GFX6-NEXT: s_mul_i32 s4, s4, s13 -; GFX6-NEXT: s_sub_i32 s4, s9, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s13 -; GFX6-NEXT: s_cmp_ge_u32 s4, s13 +; GFX6-NEXT: v_readfirstlane_b32 s2, v1 +; GFX6-NEXT: s_mul_i32 s2, s2, s13 +; GFX6-NEXT: s_sub_i32 s2, s9, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s13 +; GFX6-NEXT: s_cmp_ge_u32 s2, s13 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s4, s13 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s2, s13 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_sub_i32 s6, 0, s14 ; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v3 ; GFX6-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v6 @@ -1277,14 +1277,14 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_i32 s2, 0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX9-NEXT: s_sub_i32 s2, 0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -1295,71 +1295,71 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_i32 s2, s2, s3 ; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 ; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s3, s2, s8 -; GFX9-NEXT: s_sub_i32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s13, s2, 1 -; GFX9-NEXT: s_sub_i32 s4, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s2, s13, s2 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s4, s2, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s9 -; GFX9-NEXT: s_mul_i32 s3, s3, s12 -; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3 -; GFX9-NEXT: s_add_i32 s12, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s8, s3 +; GFX9-NEXT: s_mul_i32 s3, s2, s12 +; GFX9-NEXT: s_sub_i32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s5, s2, 1 +; GFX9-NEXT: s_sub_i32 s6, s3, s12 +; GFX9-NEXT: s_cmp_ge_u32 s3, s12 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s5, s2, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s12 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_sub_i32 s3, 0, s13 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s4, s4, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12 -; GFX9-NEXT: s_mul_i32 s4, s3, s9 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s8, s3, 1 -; GFX9-NEXT: s_sub_i32 s5, s4, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4 +; GFX9-NEXT: s_mul_i32 s4, s3, s13 +; GFX9-NEXT: s_sub_i32 s4, s9, s4 +; GFX9-NEXT: s_add_i32 s5, s3, 1 +; GFX9-NEXT: s_sub_i32 s6, s4, s13 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s9 +; GFX9-NEXT: s_cmp_ge_u32 s4, s13 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 ; GFX9-NEXT: s_add_i32 s5, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s9 +; GFX9-NEXT: s_cmp_ge_u32 s4, s13 ; GFX9-NEXT: s_cselect_b32 s3, s5, s3 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 -; GFX9-NEXT: s_sub_i32 s4, 0, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 +; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_mul_hi_u32 s4, s6, s5 -; GFX9-NEXT: s_mul_i32 s5, s4, s10 -; GFX9-NEXT: s_sub_i32 s5, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s14 +; GFX9-NEXT: s_sub_i32 s5, s10, s5 ; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_sub_i32 s8, s5, s10 +; GFX9-NEXT: s_sub_i32 s7, s5, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_cmp_ge_u32 s5, s10 +; GFX9-NEXT: s_cmp_ge_u32 s5, s14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 ; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s10 +; GFX9-NEXT: s_cmp_ge_u32 s5, s14 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s11 +; GFX9-NEXT: s_sub_i32 s5, 0, s15 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s5, s5, s6 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s7, s6 -; GFX9-NEXT: s_mul_i32 s6, s5, s11 -; GFX9-NEXT: s_sub_i32 s6, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s11, s6 +; GFX9-NEXT: s_mul_i32 s6, s5, s15 +; GFX9-NEXT: s_sub_i32 s6, s11, s6 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s8, s6, s11 -; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_sub_i32 s8, s6, s15 +; GFX9-NEXT: s_cmp_ge_u32 s6, s15 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_cmp_ge_u32 s6, s15 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1499,14 +1499,13 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: urem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX6-NEXT: s_sub_i32 s0, 0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -1514,85 +1513,87 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s14 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_mul_i32 s0, s0, s12 +; GFX6-NEXT: s_sub_i32 s0, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s12 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 +; GFX6-NEXT: s_sub_i32 s1, s0, s12 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 +; GFX6-NEXT: s_cselect_b32 s6, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s9 -; GFX6-NEXT: s_sub_i32 s1, s5, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s13 +; GFX6-NEXT: s_sub_i32 s0, s9, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s13 +; GFX6-NEXT: s_cmp_ge_u32 s0, s13 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s13 +; GFX6-NEXT: s_cmp_ge_u32 s0, s13 +; GFX6-NEXT: s_cselect_b32 s7, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s14 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s10 -; GFX6-NEXT: s_sub_i32 s4, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s14 +; GFX6-NEXT: s_sub_i32 s0, s10, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s14 +; GFX6-NEXT: s_cmp_ge_u32 s0, s14 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s14 +; GFX6-NEXT: s_cmp_ge_u32 s0, s14 +; GFX6-NEXT: s_cselect_b32 s8, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s15 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: s_mul_i32 s0, s0, s11 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s15 +; GFX6-NEXT: s_sub_i32 s4, s11, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s15 +; GFX6-NEXT: s_cmp_ge_u32 s4, s15 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s15 +; GFX6-NEXT: s_cmp_ge_u32 s4, s15 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_i32 s2, 0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX9-NEXT: s_sub_i32 s2, 0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -1605,61 +1606,61 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_i32 s2, s2, s3 ; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 ; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, s8 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s8 -; GFX9-NEXT: s_cmp_ge_u32 s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s2, s8, s3 +; GFX9-NEXT: s_mul_i32 s2, s2, s12 +; GFX9-NEXT: s_sub_i32 s2, s8, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s12 +; GFX9-NEXT: s_cmp_ge_u32 s2, s12 ; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s8 -; GFX9-NEXT: s_cmp_ge_u32 s2, s8 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: s_sub_i32 s3, s2, s12 +; GFX9-NEXT: s_cmp_ge_u32 s2, s12 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s9 -; GFX9-NEXT: s_mul_i32 s3, s3, s12 -; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3 -; GFX9-NEXT: s_add_i32 s12, s12, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12 -; GFX9-NEXT: s_mul_i32 s3, s3, s9 -; GFX9-NEXT: s_sub_i32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s9 +; GFX9-NEXT: s_sub_i32 s3, 0, s13 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s4, s4, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4 +; GFX9-NEXT: s_mul_i32 s3, s3, s13 +; GFX9-NEXT: s_sub_i32 s3, s9, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s13 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_cmp_ge_u32 s3, s9 +; GFX9-NEXT: s_cmp_ge_u32 s3, s13 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s9 -; GFX9-NEXT: s_cmp_ge_u32 s3, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 +; GFX9-NEXT: s_sub_i32 s4, s3, s13 +; GFX9-NEXT: s_cmp_ge_u32 s3, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s10 +; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_mul_hi_u32 s4, s6, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s10 -; GFX9-NEXT: s_sub_i32 s4, s6, s4 -; GFX9-NEXT: s_sub_i32 s5, s4, s10 +; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s14 +; GFX9-NEXT: s_sub_i32 s4, s10, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s10 +; GFX9-NEXT: s_cmp_ge_u32 s4, s14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s4, s10 -; GFX9-NEXT: s_cmp_ge_u32 s4, s10 +; GFX9-NEXT: s_sub_i32 s5, s4, s14 +; GFX9-NEXT: s_cmp_ge_u32 s4, s14 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s11 +; GFX9-NEXT: s_sub_i32 s5, 0, s15 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s5, s5, s6 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s7, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s11 -; GFX9-NEXT: s_sub_i32 s5, s7, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s11 -; GFX9-NEXT: s_cmp_ge_u32 s5, s11 +; GFX9-NEXT: s_mul_hi_u32 s5, s11, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s15 +; GFX9-NEXT: s_sub_i32 s5, s11, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s15 +; GFX9-NEXT: s_cmp_ge_u32 s5, s15 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s11 -; GFX9-NEXT: s_cmp_ge_u32 s5, s11 +; GFX9-NEXT: s_sub_i32 s6, s5, s15 +; GFX9-NEXT: s_cmp_ge_u32 s5, s15 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1843,37 +1844,37 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_abs_i32 s0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s1, 0, s0 -; GFX6-NEXT: s_xor_b32 s4, s8, s12 +; GFX6-NEXT: s_xor_b32 s2, s8, s12 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX6-NEXT: s_abs_i32 s1, s8 -; GFX6-NEXT: s_ashr_i32 s8, s4, 31 +; GFX6-NEXT: s_ashr_i32 s8, s2, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, s1, s0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_abs_i32 s4, s13 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s4 +; GFX6-NEXT: s_abs_i32 s2, s13 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_xor_b32 s6, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -1882,22 +1883,22 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s5, v2 -; GFX6-NEXT: s_abs_i32 s5, s9 +; GFX6-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX6-NEXT: s_abs_i32 s3, s9 ; GFX6-NEXT: s_ashr_i32 s9, s6, 31 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s3, v2 ; GFX6-NEXT: v_readfirstlane_b32 s6, v2 -; GFX6-NEXT: s_mul_i32 s6, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_sub_i32 s6, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 +; GFX6-NEXT: s_mul_i32 s6, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s6 +; GFX6-NEXT: s_sub_i32 s6, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; GFX6-NEXT: s_cselect_b32 s5, s6, s5 +; GFX6-NEXT: s_cselect_b32 s3, s6, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_abs_i32 s6, s14 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX6-NEXT: s_sub_i32 s7, 0, s6 @@ -1933,7 +1934,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[6:7] ; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v6 @@ -1965,118 +1966,118 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 +; GFX9-NEXT: s_abs_i32 s0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s1, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 -; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: s_xor_b32 s1, s8, s12 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s8, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 -; GFX9-NEXT: s_add_i32 s12, s12, s8 -; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s12, s8, s0 -; GFX9-NEXT: s_sub_i32 s4, s4, s12 -; GFX9-NEXT: s_add_i32 s13, s8, 1 -; GFX9-NEXT: s_sub_i32 s12, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s8, s13, s8 -; GFX9-NEXT: s_cselect_b32 s4, s12, s4 -; GFX9-NEXT: s_add_i32 s12, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s12, s8 -; GFX9-NEXT: s_abs_i32 s4, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s6 +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; GFX9-NEXT: s_sub_i32 s6, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_cselect_b32 s2, s6, s2 +; GFX9-NEXT: s_add_i32 s6, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s3 +; GFX9-NEXT: s_abs_i32 s2, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s8, s5, s9 -; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; GFX9-NEXT: s_sub_i32 s8, s0, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s12, s0, s1 -; GFX9-NEXT: s_abs_i32 s5, s5 -; GFX9-NEXT: s_ashr_i32 s8, s8, 31 +; GFX9-NEXT: s_abs_i32 s6, s9 +; GFX9-NEXT: s_xor_b32 s3, s9, s13 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 +; GFX9-NEXT: s_mul_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s7 ; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s1, s0, s4 -; GFX9-NEXT: s_sub_i32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s5, s1, s4 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s5, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX9-NEXT: s_mul_i32 s1, s0, s2 +; GFX9-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s7, s0, 1 +; GFX9-NEXT: s_sub_i32 s6, s1, s2 +; GFX9-NEXT: s_cmp_ge_u32 s1, s2 +; GFX9-NEXT: s_cselect_b32 s0, s7, s0 +; GFX9-NEXT: s_cselect_b32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s6, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s2 +; GFX9-NEXT: s_cselect_b32 s0, s6, s0 +; GFX9-NEXT: s_abs_i32 s1, s14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s6, s10 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: s_xor_b32 s0, s0, s3 +; GFX9-NEXT: s_sub_i32 s7, 0, s1 +; GFX9-NEXT: s_sub_i32 s3, s0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_sub_i32 s8, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_abs_i32 s6, s10 +; GFX9-NEXT: s_xor_b32 s2, s10, s14 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s6, s0, s1 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_hi_u32 s7, s0, s7 +; GFX9-NEXT: s_add_i32 s0, s0, s7 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX9-NEXT: s_mul_i32 s7, s0, s1 +; GFX9-NEXT: s_sub_i32 s6, s6, s7 ; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s6, s5, s1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_sub_i32 s7, s6, s1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s1 ; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s5, s6, s0 -; GFX9-NEXT: s_abs_i32 s6, s11 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_xor_b32 s2, s7, s11 +; GFX9-NEXT: s_cselect_b32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s1 +; GFX9-NEXT: s_cselect_b32 s6, s7, s0 +; GFX9-NEXT: s_abs_i32 s7, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_xor_b32 s5, s6, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_abs_i32 s3, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s2, s5, s2 +; GFX9-NEXT: s_abs_i32 s4, s11 +; GFX9-NEXT: s_xor_b32 s3, s11, s15 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: s_mul_i32 s7, s7, s5 -; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 -; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s7, s5, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX9-NEXT: s_mul_i32 s6, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 ; GFX9-NEXT: s_add_i32 s8, s5, 1 -; GFX9-NEXT: s_sub_i32 s7, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_sub_i32 s6, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 ; GFX9-NEXT: s_cselect_b32 s5, s8, s5 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s3, s7, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_add_i32 s6, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s4, s6, s5 +; GFX9-NEXT: s_xor_b32 s4, s4, s3 +; GFX9-NEXT: s_sub_i32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm @@ -2244,213 +2245,213 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s0, s8 +; GFX6-NEXT: s_abs_i32 s0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_ashr_i32 s2, s8, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 -; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: s_abs_i32 s1, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: s_mul_i32 s8, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s0 +; GFX6-NEXT: s_sub_i32 s1, s1, s3 +; GFX6-NEXT: s_sub_i32 s3, s1, s0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s8, s1 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s3, s1 +; GFX6-NEXT: s_sub_i32 s3, s1, s0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s0, s8, s1 -; GFX6-NEXT: s_abs_i32 s1, s9 +; GFX6-NEXT: s_cselect_b32 s0, s3, s1 +; GFX6-NEXT: s_abs_i32 s1, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s8, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s0, s4 -; GFX6-NEXT: s_sub_i32 s0, s0, s4 +; GFX6-NEXT: s_sub_i32 s3, 0, s1 +; GFX6-NEXT: s_xor_b32 s0, s0, s2 +; GFX6-NEXT: s_sub_i32 s7, s0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 -; GFX6-NEXT: s_abs_i32 s8, s5 -; GFX6-NEXT: s_ashr_i32 s5, s5, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s1, s8, s4 -; GFX6-NEXT: s_abs_i32 s4, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s8, 0, s4 -; GFX6-NEXT: s_xor_b32 s1, s1, s5 -; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s2, s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s2, s0 +; GFX6-NEXT: s_sub_i32 s2, s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s2, s0 +; GFX6-NEXT: s_abs_i32 s1, s14 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_sub_i32 s2, 0, s1 +; GFX6-NEXT: s_xor_b32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s6, s0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_ashr_i32 s8, s10, 31 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 -; GFX6-NEXT: s_abs_i32 s8, s6 -; GFX6-NEXT: s_ashr_i32 s6, s6, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_abs_i32 s2, s10 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s5, v0 -; GFX6-NEXT: s_mul_i32 s5, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s5, s8, s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s8, s5 -; GFX6-NEXT: s_abs_i32 s5, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_sub_i32 s8, 0, s5 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s2, s0 +; GFX6-NEXT: s_sub_i32 s2, s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s2, s0 +; GFX6-NEXT: s_sub_i32 s2, s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s9, s2, s0 +; GFX6-NEXT: s_abs_i32 s10, s15 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX6-NEXT: s_sub_i32 s0, 0, s10 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_abs_i32 s0, s7 -; GFX6-NEXT: v_mul_lo_u32 v1, s8, v2 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_xor_b32 s2, s4, s6 -; GFX6-NEXT: s_sub_i32 s2, s2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_abs_i32 s4, s11 +; GFX6-NEXT: s_ashr_i32 s5, s11, 31 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_ashr_i32 s1, s7, 31 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: s_xor_b32 s6, s9, s8 +; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s3, s3, s5 -; GFX6-NEXT: s_sub_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 +; GFX6-NEXT: v_readfirstlane_b32 s7, v2 +; GFX6-NEXT: s_mul_i32 s7, s7, s10 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s7, s4, s10 +; GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s7, s4, s10 +; GFX6-NEXT: s_cmp_ge_u32 s4, s10 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 +; GFX9-NEXT: s_abs_i32 s0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s4, 31 -; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: s_ashr_i32 s1, s8, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s8, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 -; GFX9-NEXT: s_add_i32 s12, s12, s8 -; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s8, s8, s0 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s8, s4 -; GFX9-NEXT: s_abs_i32 s4, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s3, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s2 +; GFX9-NEXT: s_abs_i32 s2, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s12, s0, s1 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; GFX9-NEXT: s_sub_i32 s8, s0, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 -; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_abs_i32 s6, s9 +; GFX9-NEXT: s_ashr_i32 s3, s9, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 +; GFX9-NEXT: s_mul_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s7 ; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 ; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 +; GFX9-NEXT: s_sub_i32 s1, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 ; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 +; GFX9-NEXT: s_abs_i32 s1, s14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s6, 31 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: s_xor_b32 s0, s0, s3 +; GFX9-NEXT: s_sub_i32 s7, 0, s1 +; GFX9-NEXT: s_sub_i32 s3, s0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_sub_i32 s8, s0, s8 +; GFX9-NEXT: s_abs_i32 s6, s10 +; GFX9-NEXT: s_ashr_i32 s2, s10, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 +; GFX9-NEXT: s_mul_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_hi_u32 s7, s0, s7 +; GFX9-NEXT: s_add_i32 s0, s0, s7 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 ; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s6, s0, s1 ; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s6, s0, s1 ; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s5, s5, s0 -; GFX9-NEXT: s_abs_i32 s6, s11 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 +; GFX9-NEXT: s_cselect_b32 s6, s6, s0 +; GFX9-NEXT: s_abs_i32 s7, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_xor_b32 s5, s6, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GFX9-NEXT: s_abs_i32 s3, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s2, s5, s2 +; GFX9-NEXT: s_abs_i32 s4, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_ashr_i32 s3, s11, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: s_mul_i32 s7, s7, s5 -; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 -; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s5, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_mul_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s5, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_xor_b32 s4, s4, s3 +; GFX9-NEXT: s_sub_i32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm @@ -2546,19 +2547,19 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s6, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s8, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -2567,11 +2568,11 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: s_and_b32 s4, s9, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 @@ -2580,10 +2581,10 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s7, 16 +; GFX6-NEXT: s_lshr_b32 s4, s11, 16 ; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_lshr_b32 s4, s5, 16 +; GFX6-NEXT: s_lshr_b32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -2606,43 +2607,42 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s7, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_and_b32 s6, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: s_and_b32 s2, s5, 0xffff +; GFX9-NEXT: s_and_b32 s0, s1, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX9-NEXT: s_lshr_b32 s2, s7, 16 +; GFX9-NEXT: s_lshr_b32 s0, s3, 16 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s2, s5, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 +; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -2650,6 +2650,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -2659,7 +2660,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -2761,49 +2762,49 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s6, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_and_b32 s8, s4, 0xffff -; GFX6-NEXT: s_lshr_b32 s9, s6, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: s_lshr_b32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 -; GFX6-NEXT: s_and_b32 s6, s7, 0xffff +; GFX6-NEXT: s_and_b32 s6, s11, 0xffff ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX6-NEXT: s_and_b32 s6, s5, 0xffff -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX6-NEXT: s_and_b32 s5, s9, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_lshr_b32 s4, s7, 16 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s8, v1 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 +; GFX6-NEXT: s_lshr_b32 s4, s11, 16 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_lshr_b32 s6, s5, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s6 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 +; GFX6-NEXT: s_lshr_b32 s5, s9, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 @@ -2814,11 +2815,11 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2829,45 +2830,45 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff +; GFX9-NEXT: s_and_b32 s9, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s8, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff +; GFX9-NEXT: s_and_b32 s4, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: s_and_b32 s5, s1, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 ; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 @@ -2878,18 +2879,18 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 -; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_sub_u32_e32 v4, s0, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = urem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -2999,65 +3000,65 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s8, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX6-NEXT: s_sext_i32_i16 s9, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX6-NEXT: s_xor_b32 s8, s9, s8 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s10, s8, 1 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s8, s10, 0 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_ashr_i32 s5, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_sext_i32_i16 s6, s7 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s8, v2 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_sext_i32_i16 s5, s11 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s5 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 -; GFX6-NEXT: s_ashr_i32 s6, s7, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_ashr_i32 s5, s11, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s5, 16 +; GFX6-NEXT: s_ashr_i32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -3079,79 +3080,79 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_sext_i32_i16 s5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s8, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s8, 0 +; GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: s_or_b32 s4, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 +; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: s_ashr_i32 s3, s7, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 -; GFX9-NEXT: s_ashr_i32 s2, s5, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: s_ashr_i32 s2, s3, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 +; GFX9-NEXT: s_ashr_i32 s0, s1, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3269,90 +3270,90 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s8, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX6-NEXT: s_sext_i32_i16 s9, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX6-NEXT: s_xor_b32 s8, s9, s8 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s10, s8, 1 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s8, s10, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s8, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: s_ashr_i32 s8, s6, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GFX6-NEXT: s_lshr_b32 s10, s4, 16 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_ashr_i32 s5, s8, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s4, s4, s8 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v1| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_lshr_b32 s7, s10, 16 +; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s8, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s7 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_sext_i32_i16 s6, s5 -; GFX6-NEXT: s_xor_b32 s4, s6, s4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 +; GFX6-NEXT: s_sext_i32_i16 s5, s9 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v2| ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 -; GFX6-NEXT: s_ashr_i32 s6, s7, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v2| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: s_lshr_b32 s8, s7, 16 -; GFX6-NEXT: s_ashr_i32 s7, s5, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s7 +; GFX6-NEXT: s_ashr_i32 s4, s11, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: s_ashr_i32 s5, s9, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s6, s7, s6 -; GFX6-NEXT: s_ashr_i32 s6, s6, 30 -; GFX6-NEXT: s_lshr_b32 s4, s5, 16 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_lshr_b32 s6, s9, 16 +; GFX6-NEXT: s_lshr_b32 s7, s11, 16 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: s_or_b32 s9, s6, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, |v2| -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX6-NEXT: s_cselect_b32 s6, s9, 0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v5 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s8 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s8, 0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -3361,87 +3362,87 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s6 +; GFX9-NEXT: s_sext_i32_i16 s8, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s4 +; GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: s_xor_b32 s2, s9, s8 +; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s10, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s10, 0 +; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_ashr_i32 s0, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s2, s10, s0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_add_u32_e32 v1, s4, v3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: s_sext_i32_i16 s8, s1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_sext_i32_i16 s8, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX9-NEXT: s_sext_i32_i16 s6, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_xor_b32 s0, s8, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s2, s6, s8 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 -; GFX9-NEXT: s_ashr_i32 s7, s7, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: s_ashr_i32 s3, s3, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX9-NEXT: s_ashr_i32 s2, s1, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s2, s5, s7 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 +; GFX9-NEXT: s_xor_b32 s0, s2, s3 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3472,14 +3473,14 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: udiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 +; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX6-NEXT: s_and_b32 s4, s4, 7 +; GFX6-NEXT: s_and_b32 s4, s6, 7 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 @@ -3494,14 +3495,14 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s2, s4, 7 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -3543,15 +3544,15 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: urem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 +; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX6-NEXT: s_and_b32 s3, s4, 7 +; GFX6-NEXT: s_and_b32 s3, s6, 7 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 -; GFX6-NEXT: s_lshr_b32 s2, s4, 8 +; GFX6-NEXT: s_lshr_b32 s2, s6, 8 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -3561,21 +3562,21 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX9-NEXT: s_bfe_u32 s0, s2, 0x30008 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s1, s4, 7 +; GFX9-NEXT: s_and_b32 s1, s2, 7 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s2, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -3584,8 +3585,8 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] @@ -3623,16 +3624,16 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: sdiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30008 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_bfe_i32 s4, s4, 0x30000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_bfe_i32 s4, s6, 0x30008 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0x30000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -3650,15 +3651,15 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s3, s2, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 @@ -3708,46 +3709,46 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: srem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GFX6-NEXT: s_bfe_i32 s2, s6, 0x30008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX6-NEXT: s_bfe_i32 s3, s4, 0x30000 +; GFX6-NEXT: s_bfe_i32 s3, s6, 0x30000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX6-NEXT: s_xor_b32 s2, s3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 -; GFX6-NEXT: s_lshr_b32 s5, s4, 8 -; GFX6-NEXT: s_or_b32 s6, s2, 1 +; GFX6-NEXT: s_lshr_b32 s4, s6, 8 +; GFX6-NEXT: s_or_b32 s5, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX6-NEXT: s_cselect_b32 s2, s6, 0 +; GFX6-NEXT: s_cselect_b32 s2, s5, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x30008 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0x30000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_lshr_b32 s5, s4, 8 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -3757,10 +3758,10 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] @@ -3837,19 +3838,19 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s6, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s8, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -3858,11 +3859,11 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: s_and_b32 s4, s9, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 @@ -3884,39 +3885,39 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s7, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_and_b32 s6, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s2, s5, 0xffff +; GFX9-NEXT: s_and_b32 s0, s1, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 @@ -3924,8 +3925,8 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v6, v0, s[0:1] +; GFX9-NEXT: global_store_short v6, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v6, v0, s[6:7] ; GFX9-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4005,52 +4006,52 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s6, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_and_b32 s8, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_lshr_b32 s9, s6, 16 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 +; GFX6-NEXT: s_lshr_b32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: s_and_b32 s6, s11, 0xffff ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s5, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 +; GFX6-NEXT: s_and_b32 s6, s9, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4058,20 +4059,20 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: s_and_b32 s9, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s8, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 @@ -4080,11 +4081,11 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4095,16 +4096,16 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 ; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, s0, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[6:7] ; GFX9-NEXT: s_endpgm %r = urem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4190,49 +4191,49 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s8, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX6-NEXT: s_sext_i32_i16 s9, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX6-NEXT: s_xor_b32 s8, s9, s8 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s10, s8, 1 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s8, s10, 0 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s8, v2 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_ashr_i32 s5, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_sext_i32_i16 s6, s7 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v0| -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_sext_i32_i16 s5, s11 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s5 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -4252,62 +4253,62 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_sext_i32_i16 s5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s8, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s8, 0 +; GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v2, s2, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[6:7] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4399,68 +4400,68 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s8, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX6-NEXT: s_sext_i32_i16 s9, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX6-NEXT: s_xor_b32 s8, s9, s8 +; GFX6-NEXT: s_sext_i32_i16 s4, s10 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: s_sext_i32_i16 s5, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s10, s8, 1 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s8, s10, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s8, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: s_ashr_i32 s8, s6, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GFX6-NEXT: s_lshr_b32 s10, s4, 16 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GFX6-NEXT: s_ashr_i32 s4, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_ashr_i32 s5, s8, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s4, s4, s8 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v1| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_lshr_b32 s7, s10, 16 +; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s8, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s7 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX6-NEXT: s_sext_i32_i16 s6, s5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GFX6-NEXT: s_sext_i32_i16 s5, s9 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s4, s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: s_or_b32 s7, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s7, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s10, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 @@ -4469,68 +4470,68 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s6 +; GFX9-NEXT: s_sext_i32_i16 s8, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s4 +; GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: s_xor_b32 s2, s9, s8 +; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 -; GFX9-NEXT: s_sext_i32_i16 s7, s7 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s10, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s10, 0 +; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_ashr_i32 s0, s2, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s2, s10, s0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_add_u32_e32 v1, s4, v2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: s_or_b32 s8, s2, 1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s7 -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX9-NEXT: s_sext_i32_i16 s3, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: s_xor_b32 s2, s5, s7 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX9-NEXT: s_xor_b32 s0, s3, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_or_b32 s6, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[6:7] ; GFX9-NEXT: s_endpgm %r = srem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4604,23 +4605,23 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s10, 0x7fff -; GFX6-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff +; GFX6-NEXT: s_and_b32 s7, s4, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX6-NEXT: s_bfe_u32 s3, s10, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: s_bfe_u32 s7, s10, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 @@ -4651,41 +4652,41 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff -; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f -; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4709,9 +4710,9 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 ; GFX9-NEXT: s_endpgm %r = udiv <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -4791,21 +4792,21 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_and_b32 s8, s4, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s3, s10, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_and_b32 s7, s10, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 +; GFX6-NEXT: s_bfe_u32 s5, s4, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 @@ -4814,7 +4815,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, s10 @@ -4834,52 +4835,52 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 +; GFX6-NEXT: s_lshr_b32 s4, s4, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s2, s10, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v1 +; GFX6-NEXT: s_lshr_b32 s6, s10, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s5, s9 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: s_bfe_u32 s2, s0, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_and_b32 s3, s6, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX9-NEXT: s_bfe_u32 s4, s6, 0xf000f ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f +; GFX9-NEXT: s_bfe_u32 s5, s2, 0xf000f ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 @@ -4896,15 +4897,15 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 -; GFX9-NEXT: s_lshr_b32 s1, s0, 15 +; GFX9-NEXT: s_lshr_b32 s3, s6, 15 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX9-NEXT: s_lshr_b32 s0, s6, 15 -; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v1 +; GFX9-NEXT: s_lshr_b32 s3, s2, 15 +; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 +; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] @@ -4912,9 +4913,9 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 ; GFX9-NEXT: s_endpgm %r = urem <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -5000,50 +5001,50 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX6-NEXT: s_bfe_i32 s6, s4, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 30 +; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s1, s1, s2 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 -; GFX6-NEXT: s_or_b32 s1, s1, 1 +; GFX6-NEXT: s_xor_b32 s5, s5, s6 +; GFX6-NEXT: s_ashr_i32 s5, s5, 30 +; GFX6-NEXT: s_or_b32 s5, s5, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v4 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: s_cselect_b32 s5, s5, 0 +; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s5, v4 +; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 ; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5061,60 +5062,60 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_bfe_i32 s3, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 -; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_ashr_i32 s3, s3, 30 +; GFX9-NEXT: s_or_b32 s3, s3, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0xf000f ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v5 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_add_u32_e32 v4, s3, v5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 -; GFX9-NEXT: s_or_b32 s2, s0, 1 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, 0 +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 +; GFX9-NEXT: v_add_u32_e32 v5, s2, v6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5133,9 +5134,9 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -5227,57 +5228,57 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX6-NEXT: s_xor_b32 s1, s2, s1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: s_bfe_i32 s6, s10, 0xf0000 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 +; GFX6-NEXT: s_bfe_i32 s5, s4, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s6 +; GFX6-NEXT: s_xor_b32 s5, s6, s5 +; GFX6-NEXT: s_ashr_i32 s5, s5, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: s_lshr_b32 s8, s10, 15 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: s_lshr_b32 s9, s0, 15 -; GFX6-NEXT: s_or_b32 s1, s1, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v6 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: s_lshr_b32 s9, s4, 15 +; GFX6-NEXT: s_or_b32 s5, s5, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v5|, |v4| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: s_cselect_b32 s5, s5, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s5, v6 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, s4 +; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 +; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s0, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 @@ -5301,60 +5302,59 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s6, 0xf0000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX9-NEXT: s_xor_b32 s1, s2, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 +; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s4 +; GFX9-NEXT: s_xor_b32 s3, s4, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 -; GFX9-NEXT: s_lshr_b32 s8, s6, 15 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_ashr_i32 s3, s3, 30 +; GFX9-NEXT: s_lshr_b32 s8, s2, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_lshr_b32 s7, s0, 15 -; GFX9-NEXT: s_or_b32 s1, s1, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v6 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 +; GFX9-NEXT: s_lshr_b32 s7, s6, 15 +; GFX9-NEXT: s_or_b32 s3, s3, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s3, v6 +; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s3 +; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s4 +; GFX9-NEXT: s_xor_b32 s3, s4, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s3, s3, 30 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX9-NEXT: s_or_b32 s2, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX9-NEXT: s_or_b32 s3, s3, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: v_add_u32_e32 v5, s3, v7 ; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 ; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 @@ -5367,11 +5367,12 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 ; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 @@ -5380,9 +5381,9 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 ; GFX9-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -5397,14 +5398,14 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 @@ -5413,15 +5414,15 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, 20 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -5438,23 +5439,23 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s4, s4, 12 +; GFX6-NEXT: s_lshr_b32 s4, s6, 12 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 12 +; GFX9-NEXT: s_lshr_b32 s2, s2, 12 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -5472,7 +5473,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5486,13 +5487,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s7, 12 -; GFX9-NEXT: s_lshr_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_add_i32 s3, s3, 12 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y @@ -5513,7 +5514,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5528,14 +5529,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s6, 12 -; GFX9-NEXT: s_lshr_b32 s1, s7, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_lshr_b32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b32 s3, s3, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5555,7 +5556,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -5574,18 +5575,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s1, s7, 0x100101 -; GFX9-NEXT: s_sub_i32 s2, s7, s1 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_add_i32 s2, s2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s6, 12 -; GFX9-NEXT: s_lshr_b32 s1, s2, 11 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b32 s3, s3, 11 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5664,16 +5665,16 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s1, 0, s0 -; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -5681,94 +5682,95 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 ; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s8, s1 +; GFX6-NEXT: s_sub_i32 s3, s1, s0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s1, s3, s1 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX6-NEXT: s_sub_i32 s4, 0, s6 -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 -; GFX6-NEXT: s_mul_i32 s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s0, s5, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s6 +; GFX6-NEXT: s_mul_i32 s0, s0, s2 +; GFX6-NEXT: s_sub_i32 s0, s9, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: s_cselect_b32 s0, s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s4, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s3, s2, s6 -; GFX9-NEXT: s_sub_i32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s9, s2, 1 -; GFX9-NEXT: s_sub_i32 s4, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s2, s9, s2 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s4, s2, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s7 +; GFX9-NEXT: s_sub_i32 s0, s0, s5 +; GFX9-NEXT: s_add_i32 s9, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s4, s3, s7 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s6, s3, 1 -; GFX9-NEXT: s_sub_i32 s5, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_cselect_b32 s0, s5, s4 +; GFX9-NEXT: s_sub_i32 s4, 0, s6 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 +; GFX9-NEXT: s_mul_i32 s5, s4, s6 +; GFX9-NEXT: s_sub_i32 s1, s1, s5 +; GFX9-NEXT: s_add_i32 s7, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s1, s6 +; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_cselect_b32 s1, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = udiv <2 x i32> %x, %shl.y @@ -5784,36 +5786,36 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s4, s2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s4, 1 +; GFX9-NEXT: s_add_i32 s4, s4, s3 +; GFX9-NEXT: s_lshr_b32 s3, s4, 20 +; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -5830,23 +5832,23 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: s_and_b32 s4, s6, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff +; GFX9-NEXT: s_and_b32 s2, s2, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -5864,7 +5866,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5879,14 +5881,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 -; GFX9-NEXT: s_add_i32 s0, s0, -1 -; GFX9-NEXT: s_and_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_add_i32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y @@ -5907,7 +5909,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5922,14 +5924,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, 0xfff -; GFX9-NEXT: s_and_b32 s1, s7, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xfff +; GFX9-NEXT: s_and_b32 s3, s3, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6004,102 +6006,102 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 -; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 -; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s2 +; GFX6-NEXT: s_sub_i32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s2, 0, s3 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s6 -; GFX6-NEXT: s_sub_i32 s5, s5, s7 -; GFX6-NEXT: s_sub_i32 s7, s5, s6 -; GFX6-NEXT: s_cmp_ge_u32 s5, s6 -; GFX6-NEXT: s_cselect_b32 s5, s7, s5 -; GFX6-NEXT: s_sub_i32 s7, s5, s6 -; GFX6-NEXT: s_cmp_ge_u32 s5, s6 -; GFX6-NEXT: s_cselect_b32 s5, s7, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s4, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s7 +; GFX9-NEXT: s_sub_i32 s0, s0, s4 +; GFX9-NEXT: s_sub_i32 s4, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s0, s4, s0 +; GFX9-NEXT: s_sub_i32 s4, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_cselect_b32 s0, s4, s0 +; GFX9-NEXT: s_sub_i32 s4, 0, s6 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 +; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s1, s1, s4 +; GFX9-NEXT: s_sub_i32 s4, s1, s6 +; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_cselect_b32 s1, s4, s1 +; GFX9-NEXT: s_sub_i32 s4, s1, s6 +; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_cselect_b32 s1, s4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = urem <2 x i32> %x, %shl.y @@ -6115,14 +6117,14 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_mul_hi_i32 v0, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -6131,15 +6133,15 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_i32 s3, s2, 0xd9528441 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, 31 +; GFX9-NEXT: s_ashr_i32 s3, s3, 20 +; GFX9-NEXT: s_add_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -6156,29 +6158,29 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s5, s4, 31 -; GFX6-NEXT: s_lshr_b32 s5, s5, 20 -; GFX6-NEXT: s_add_i32 s4, s4, s5 -; GFX6-NEXT: s_ashr_i32 s4, s4, 12 +; GFX6-NEXT: s_ashr_i32 s4, s6, 31 +; GFX6-NEXT: s_lshr_b32 s4, s4, 20 +; GFX6-NEXT: s_add_i32 s6, s6, s4 +; GFX6-NEXT: s_ashr_i32 s4, s6, 12 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s4, s4, s2 -; GFX9-NEXT: s_ashr_i32 s2, s4, 12 +; GFX9-NEXT: s_ashr_i32 s3, s2, 31 +; GFX9-NEXT: s_lshr_b32 s3, s3, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 12 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -6196,7 +6198,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6239,41 +6241,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_ashr_i32 s2, s6, 31 -; GFX9-NEXT: s_add_i32 s3, s6, s2 -; GFX9-NEXT: s_sub_i32 s6, 0, s0 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 +; GFX9-NEXT: s_ashr_i32 s5, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s7 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s3, s7 -; GFX9-NEXT: s_mul_i32 s8, s6, s0 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s8 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s3, s0 -; GFX9-NEXT: s_cmp_ge_u32 s3, s0 +; GFX9-NEXT: s_sub_i32 s8, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_cselect_b32 s2, s8, s2 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s0 -; GFX9-NEXT: s_cselect_b32 s0, s7, s6 -; GFX9-NEXT: s_xor_b32 s1, s2, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s7, s6 +; GFX9-NEXT: s_xor_b32 s3, s5, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = sdiv i32 %x, %shl.y @@ -6294,7 +6296,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6315,20 +6317,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s6, 31 -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s0, s6, s0 -; GFX9-NEXT: s_add_i32 s1, s7, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 12 -; GFX9-NEXT: s_ashr_i32 s1, s1, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_lshr_b32 s5, s5, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_add_i32 s3, s3, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 12 +; GFX9-NEXT: s_ashr_i32 s3, s3, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6348,7 +6350,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6370,21 +6372,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s6, 31 -; GFX9-NEXT: s_mul_hi_i32 s1, s7, 0x80080081 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s1, s1, s7 -; GFX9-NEXT: s_add_i32 s0, s6, s0 -; GFX9-NEXT: s_lshr_b32 s2, s1, 31 -; GFX9-NEXT: s_ashr_i32 s1, s1, 11 -; GFX9-NEXT: s_ashr_i32 s0, s0, 12 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_i32 s5, s5, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s5, 31 +; GFX9-NEXT: s_ashr_i32 s4, s5, 11 +; GFX9-NEXT: s_ashr_i32 s2, s2, 12 +; GFX9-NEXT: s_add_i32 s4, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6481,137 +6483,136 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s1, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s6, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s4, s0 -; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX6-NEXT: s_abs_i32 s6, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s7, 0, s6 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX6-NEXT: s_abs_i32 s6, s4 -; GFX6-NEXT: s_ashr_i32 s4, s0, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 +; GFX6-NEXT: s_abs_i32 s7, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s2 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s1 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s6 +; GFX6-NEXT: s_sub_i32 s2, s7, s2 +; GFX6-NEXT: s_sub_i32 s7, s2, s6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_cselect_b32 s2, s7, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_abs_i32 s6, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_abs_i32 s2, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_xor_b32 s3, s1, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_xor_b32 s7, s5, s7 -; GFX6-NEXT: s_abs_i32 s5, s5 -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_ashr_i32 s7, s7, 31 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_ashr_i32 s3, s3, 31 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v1 -; GFX6-NEXT: s_mul_i32 s4, s4, s6 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s6 +; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: s_mul_i32 s0, s0, s2 +; GFX6-NEXT: s_sub_i32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s6 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s6 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s7, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s1, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s4, s0 -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 -; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX9-NEXT: s_abs_i32 s6, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3 +; GFX9-NEXT: s_abs_i32 s3, s0 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s0, 31 -; GFX9-NEXT: s_sub_i32 s0, 0, s1 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_ashr_i32 s0, s0, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s8, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s0, s1 -; GFX9-NEXT: s_sub_i32 s7, s7, s8 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s8, s7, s1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s7, s8, s0 -; GFX9-NEXT: s_abs_i32 s8, s6 +; GFX9-NEXT: s_mul_i32 s2, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2 +; GFX9-NEXT: s_add_i32 s8, s8, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s8 +; GFX9-NEXT: s_mul_i32 s8, s2, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 +; GFX9-NEXT: s_add_i32 s9, s2, 1 +; GFX9-NEXT: s_sub_i32 s8, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s2, s9, s2 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_add_i32 s8, s2, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s6, s8, s2 +; GFX9-NEXT: s_abs_i32 s8, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_xor_b32 s2, s5, s6 -; GFX9-NEXT: s_abs_i32 s3, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s7, s4 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_xor_b32 s5, s6, s0 ; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s0, s5, s0 +; GFX9-NEXT: s_xor_b32 s4, s1, s7 +; GFX9-NEXT: s_abs_i32 s1, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s5 ; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 ; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5 ; GFX9-NEXT: s_mul_i32 s6, s5, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 +; GFX9-NEXT: s_sub_i32 s1, s1, s6 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_sub_i32 s6, s1, s8 +; GFX9-NEXT: s_cmp_ge_u32 s1, s8 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_cselect_b32 s1, s6, s1 ; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s6, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_cmp_ge_u32 s1, s8 +; GFX9-NEXT: s_cselect_b32 s1, s6, s5 +; GFX9-NEXT: s_xor_b32 s1, s1, s4 +; GFX9-NEXT: s_sub_i32 s1, s1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = sdiv <2 x i32> %x, %shl.y @@ -6627,37 +6628,37 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 -; GFX6-NEXT: v_readfirstlane_b32 s5, v0 -; GFX6-NEXT: s_add_i32 s5, s5, s4 -; GFX6-NEXT: s_lshr_b32 s6, s5, 31 -; GFX6-NEXT: s_ashr_i32 s5, s5, 20 -; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_mul_i32 s5, s5, 0x12d8fb -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mul_hi_i32 v0, s6, v0 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: s_lshr_b32 s5, s4, 31 +; GFX6-NEXT: s_ashr_i32 s4, s4, 20 +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: s_mul_i32 s4, s4, 0x12d8fb +; GFX6-NEXT: s_sub_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: s_mul_hi_i32 s3, s2, 0xd9528441 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_lshr_b32 s4, s3, 31 +; GFX9-NEXT: s_ashr_i32 s3, s3, 20 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -6674,31 +6675,31 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s5, s4, 31 -; GFX6-NEXT: s_lshr_b32 s5, s5, 20 -; GFX6-NEXT: s_add_i32 s5, s4, s5 -; GFX6-NEXT: s_and_b32 s5, s5, 0xfffff000 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_ashr_i32 s4, s6, 31 +; GFX6-NEXT: s_lshr_b32 s4, s4, 20 +; GFX6-NEXT: s_add_i32 s4, s6, s4 +; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 +; GFX9-NEXT: s_ashr_i32 s3, s2, 31 +; GFX9-NEXT: s_lshr_b32 s3, s3, 20 +; GFX9-NEXT: s_add_i32 s3, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s3, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -6716,7 +6717,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -6753,38 +6754,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s6, 31 -; GFX9-NEXT: s_add_i32 s2, s6, s1 -; GFX9-NEXT: s_sub_i32 s3, 0, s0 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 -; GFX9-NEXT: s_mul_i32 s3, s3, s0 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s0 -; GFX9-NEXT: s_cmp_ge_u32 s2, s0 -; GFX9-NEXT: s_cselect_b32 s0, s3, s2 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s5, s5, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = srem i32 %x, %shl.y @@ -6805,7 +6806,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6828,22 +6829,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s6, 31 -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s0, s6, s0 -; GFX9-NEXT: s_add_i32 s1, s7, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s0, s6, s0 -; GFX9-NEXT: s_sub_i32 s1, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_lshr_b32 s5, s5, 20 +; GFX9-NEXT: s_add_i32 s4, s2, s4 +; GFX9-NEXT: s_add_i32 s5, s3, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6934,123 +6935,122 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s0, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 -; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 -; GFX6-NEXT: s_ashr_i32 s4, s4, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: s_abs_i32 s6, s0 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s7 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s7, s1 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s7, s7, s1 -; GFX6-NEXT: s_abs_i32 s6, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 -; GFX6-NEXT: s_abs_i32 s8, s5 -; GFX6-NEXT: s_xor_b32 s7, s7, s4 +; GFX6-NEXT: s_mul_i32 s7, s7, s2 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s2 +; GFX6-NEXT: s_cmp_ge_u32 s6, s2 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s2 +; GFX6-NEXT: s_cmp_ge_u32 s6, s2 +; GFX6-NEXT: s_cselect_b32 s2, s7, s6 +; GFX6-NEXT: s_abs_i32 s3, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s3 +; GFX6-NEXT: s_abs_i32 s8, s1 +; GFX6-NEXT: s_xor_b32 s2, s2, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_ashr_i32 s5, s5, 31 +; GFX6-NEXT: s_sub_i32 s0, s2, s0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 31 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s6 -; GFX6-NEXT: s_sub_i32 s7, s8, s7 -; GFX6-NEXT: s_sub_i32 s8, s7, s6 -; GFX6-NEXT: s_cmp_ge_u32 s7, s6 -; GFX6-NEXT: s_cselect_b32 s7, s8, s7 -; GFX6-NEXT: s_sub_i32 s8, s7, s6 -; GFX6-NEXT: s_cmp_ge_u32 s7, s6 -; GFX6-NEXT: s_cselect_b32 s6, s8, s7 -; GFX6-NEXT: s_xor_b32 s6, s6, s5 -; GFX6-NEXT: s_sub_i32 s5, s6, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_xor_b32 s2, s2, s1 +; GFX6-NEXT: s_sub_i32 s1, s2, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: s_abs_i32 s0, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_lshl_b32 s1, 0x1000, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 -; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s0 -; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_abs_i32 s7, s1 +; GFX9-NEXT: s_mul_hi_u32 s7, s0, s8 +; GFX9-NEXT: s_mul_i32 s7, s7, s2 +; GFX9-NEXT: s_sub_i32 s0, s0, s7 +; GFX9-NEXT: s_sub_i32 s7, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s0, s7, s0 +; GFX9-NEXT: s_sub_i32 s7, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s0, s7, s0 +; GFX9-NEXT: s_abs_i32 s7, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_abs_i32 s3, s5 +; GFX9-NEXT: s_xor_b32 s0, s0, s6 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: s_sub_i32 s5, 0, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s0, s0, s6 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_abs_i32 s1, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s5, s5, s6 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 ; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6 ; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s1, s1, s5 +; GFX9-NEXT: s_sub_i32 s5, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_sub_i32 s5, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_xor_b32 s1, s1, s4 +; GFX9-NEXT: s_sub_i32 s1, s1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = srem <2 x i32> %x, %shl.y @@ -7066,7 +7066,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: udiv_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x64c139ef ; GFX6-NEXT: v_mov_b32_e32 v0, 0x38f83e5 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 @@ -7096,27 +7096,27 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s6, 0x38f83e5 -; GFX9-NEXT: s_mul_i32 s1, s6, 0x38f83e5 -; GFX9-NEXT: s_mul_i32 s3, s7, 0x64c139ef -; GFX9-NEXT: s_mul_hi_u32 s6, s6, 0x64c139ef -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0x64c139ef -; GFX9-NEXT: s_add_u32 s3, s3, s6 -; GFX9-NEXT: s_addc_u32 s2, s2, 0 -; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 -; GFX9-NEXT: s_add_u32 s0, s2, s0 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: s_mul_i32 s3, s7, 0x38f83e5 -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0x38f83e5 -; GFX9-NEXT: s_add_u32 s0, s3, s0 -; GFX9-NEXT: s_addc_u32 s0, s2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x38f83e5 +; GFX9-NEXT: s_mul_i32 s5, s2, 0x38f83e5 +; GFX9-NEXT: s_mul_i32 s7, s3, 0x64c139ef +; GFX9-NEXT: s_mul_hi_u32 s2, s2, 0x64c139ef +; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0x64c139ef +; GFX9-NEXT: s_add_u32 s2, s7, s2 +; GFX9-NEXT: s_addc_u32 s6, s6, 0 +; GFX9-NEXT: s_add_u32 s2, s5, s2 +; GFX9-NEXT: s_addc_u32 s2, s4, 0 +; GFX9-NEXT: s_add_u32 s2, s6, s2 +; GFX9-NEXT: s_addc_u32 s4, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, 0x38f83e5 +; GFX9-NEXT: s_mul_i32 s3, s3, 0x38f83e5 +; GFX9-NEXT: s_add_u32 s2, s3, s2 +; GFX9-NEXT: s_addc_u32 s2, s5, s4 +; GFX9-NEXT: s_lshr_b32 s2, s2, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 store i64 %r, ptr addrspace(1) %out @@ -7131,7 +7131,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: udiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7145,13 +7145,13 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i64 %x, 4096 store i64 %r, ptr addrspace(1) %out @@ -7167,31 +7167,31 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_add_i32 s8, s8, 12 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 12 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_add_i32 s6, s6, 12 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = udiv i64 %x, %shl.y @@ -7212,33 +7212,33 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: udiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7258,32 +7258,32 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x10010011 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100100 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: s_mul_i32 s9, s7, 0x10010011 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s9, v3 -; GFX6-NEXT: s_mul_i32 s8, s6, 0x100100 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 +; GFX6-NEXT: s_mul_i32 s7, s11, 0x10010011 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GFX6-NEXT: s_mul_i32 s6, s10, 0x100100 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s8, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX6-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc -; GFX6-NEXT: s_mul_i32 s8, s7, 0x100100 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: v_addc_u32_e64 v2, s[6:7], 0, 0, vcc +; GFX6-NEXT: s_mul_i32 s6, s11, 0x100100 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v1 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v3 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], 12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: v_lshr_b64 v[2:3], v[0:1], 11 @@ -7295,37 +7295,37 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 -; GFX9-NEXT: s_mul_i32 s9, s7, 0x10010011 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, 0x10010011 -; GFX9-NEXT: s_mul_hi_u32 s8, s7, 0x10010011 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_mul_i32 s9, s3, 0x10010011 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, 0x10010011 +; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x10010011 ; GFX9-NEXT: s_add_u32 s9, s9, s10 -; GFX9-NEXT: s_mul_i32 s5, s6, 0x100100 +; GFX9-NEXT: s_mul_i32 s5, s2, 0x100100 ; GFX9-NEXT: s_addc_u32 s8, s8, 0 -; GFX9-NEXT: s_mul_hi_u32 s4, s6, 0x100100 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x100100 ; GFX9-NEXT: s_add_u32 s5, s5, s9 ; GFX9-NEXT: s_addc_u32 s4, s4, 0 ; GFX9-NEXT: s_add_u32 s4, s8, s4 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 -; GFX9-NEXT: s_mul_i32 s9, s7, 0x100100 -; GFX9-NEXT: s_mul_hi_u32 s8, s7, 0x100100 -; GFX9-NEXT: s_add_u32 s9, s9, s4 -; GFX9-NEXT: s_addc_u32 s8, s8, s5 -; GFX9-NEXT: s_sub_u32 s4, s6, s9 -; GFX9-NEXT: s_subb_u32 s5, s7, s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX9-NEXT: s_add_u32 s4, s4, s9 -; GFX9-NEXT: s_addc_u32 s5, s5, s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_mul_i32 s9, s3, 0x100100 +; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x100100 +; GFX9-NEXT: s_add_u32 s4, s9, s4 +; GFX9-NEXT: s_addc_u32 s5, s8, s5 +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s5 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, s5 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 11 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7348,32 +7348,32 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s8, 12 -; GFX6-NEXT: s_add_i32 s2, s10, 12 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: s_add_i32 s4, s12, 12 +; GFX6-NEXT: s_add_i32 s6, s14, 12 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s8, 12 -; GFX9-NEXT: s_add_i32 s8, s10, 12 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 +; GFX9-NEXT: s_add_i32 s2, s12, 12 +; GFX9-NEXT: s_add_i32 s4, s14, 12 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -7394,7 +7394,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xf6841139 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e10011 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 @@ -7432,34 +7432,34 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s7, 0xf6841139 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, 0xf6841139 -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0xf6841139 -; GFX9-NEXT: s_add_u32 s3, s3, s8 -; GFX9-NEXT: s_mul_i32 s1, s6, 0xe3e10011 -; GFX9-NEXT: s_addc_u32 s2, s2, 0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, 0xe3e10011 -; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 -; GFX9-NEXT: s_add_u32 s0, s2, s0 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: s_mul_i32 s3, s7, 0xe3e10011 -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0xe3e10011 -; GFX9-NEXT: s_add_u32 s0, s3, s0 -; GFX9-NEXT: s_addc_u32 s0, s2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 8 -; GFX9-NEXT: s_mul_i32 s1, s0, 0x11f -; GFX9-NEXT: s_mul_hi_u32 s2, s0, 0x9761f7c9 -; GFX9-NEXT: s_add_i32 s2, s2, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, 0x9761f7c9 -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s1, s7, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_i32 s7, s3, 0xf6841139 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0xf6841139 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xf6841139 +; GFX9-NEXT: s_add_u32 s7, s7, s8 +; GFX9-NEXT: s_mul_i32 s5, s2, 0xe3e10011 +; GFX9-NEXT: s_addc_u32 s6, s6, 0 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0xe3e10011 +; GFX9-NEXT: s_add_u32 s5, s5, s7 +; GFX9-NEXT: s_addc_u32 s4, s4, 0 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s5, 0, 0 +; GFX9-NEXT: s_mul_i32 s7, s3, 0xe3e10011 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xe3e10011 +; GFX9-NEXT: s_add_u32 s4, s7, s4 +; GFX9-NEXT: s_addc_u32 s4, s6, s5 +; GFX9-NEXT: s_lshr_b32 s4, s4, 8 +; GFX9-NEXT: s_mul_i32 s5, s4, 0x11f +; GFX9-NEXT: s_mul_hi_u32 s6, s4, 0x9761f7c9 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, 0x9761f7c9 +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i64 %x, 1235195393993 store i64 %r, ptr addrspace(1) %out @@ -7474,7 +7474,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 @@ -7488,12 +7488,12 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i64 %x, 4096 store i64 %r, ptr addrspace(1) %out @@ -7509,35 +7509,35 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_lshl_b64 s[4:5], 0x1000, s8 -; GFX6-NEXT: s_add_u32 s4, s4, -1 -; GFX6-NEXT: s_addc_u32 s5, s5, -1 -; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX6-NEXT: s_add_u32 s0, s0, -1 +; GFX6-NEXT: s_addc_u32 s1, s1, -1 +; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_add_u32 s0, s0, -1 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 -; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s6 +; GFX9-NEXT: s_add_u32 s4, s4, -1 +; GFX9-NEXT: s_addc_u32 s5, s5, -1 +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = urem i64 %x, %shl.y @@ -7558,32 +7558,32 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: urem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s4, s4, 0xfff -; GFX6-NEXT: s_and_b32 s5, s6, 0xfff -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_and_b32 s0, s0, 0xfff +; GFX6-NEXT: s_and_b32 s1, s2, 0xfff +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff -; GFX9-NEXT: s_and_b32 s3, s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s0, 0xfff +; GFX9-NEXT: s_and_b32 s1, s2, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7606,40 +7606,40 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s10 -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 -; GFX6-NEXT: s_add_u32 s2, s2, -1 -; GFX6-NEXT: s_addc_u32 s3, s3, -1 -; GFX6-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX6-NEXT: s_add_u32 s0, s0, -1 -; GFX6-NEXT: s_addc_u32 s1, s1, -1 -; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: s_lshl_b64 s[4:5], 0x1000, s14 +; GFX6-NEXT: s_lshl_b64 s[6:7], 0x1000, s12 +; GFX6-NEXT: s_add_u32 s6, s6, -1 +; GFX6-NEXT: s_addc_u32 s7, s7, -1 +; GFX6-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] +; GFX6-NEXT: s_add_u32 s4, s4, -1 +; GFX6-NEXT: s_addc_u32 s5, s5, -1 +; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10 -; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 -; GFX9-NEXT: s_add_u32 s8, s8, -1 -; GFX9-NEXT: s_addc_u32 s9, s9, -1 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s14 +; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s12 +; GFX9-NEXT: s_add_u32 s4, s4, -1 +; GFX9-NEXT: s_addc_u32 s5, s5, -1 +; GFX9-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_add_u32 s2, s2, -1 ; GFX9-NEXT: s_addc_u32 s3, s3, -1 -; GFX9-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -7660,7 +7660,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xfd81e19 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x6ca94220 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 @@ -7700,39 +7700,39 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s0, s6, 0x6ca94220 -; GFX9-NEXT: s_mul_i32 s1, s6, 0x6ca94220 -; GFX9-NEXT: s_mul_i32 s3, s7, 0xfd81e19 -; GFX9-NEXT: s_mul_hi_u32 s6, s6, 0xfd81e19 -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0xfd81e19 -; GFX9-NEXT: s_add_u32 s3, s3, s6 -; GFX9-NEXT: s_addc_u32 s2, s2, 0 -; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 -; GFX9-NEXT: s_add_u32 s0, s2, s0 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: s_mul_i32 s3, s7, 0x6ca94220 -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0x6ca94220 -; GFX9-NEXT: s_add_u32 s0, s3, s0 -; GFX9-NEXT: s_addc_u32 s1, s2, s1 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_mul_i32 s3, s2, 0x6ca94220 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, 0xfd81e19 -; GFX9-NEXT: s_add_i32 s3, s6, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, 0xfd81e19 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 19 -; GFX9-NEXT: s_lshr_b32 s0, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s2, s0 -; GFX9-NEXT: s_addc_u32 s1, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x6ca94220 +; GFX9-NEXT: s_mul_i32 s5, s2, 0x6ca94220 +; GFX9-NEXT: s_mul_i32 s7, s3, 0xfd81e19 +; GFX9-NEXT: s_mul_hi_u32 s2, s2, 0xfd81e19 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19 +; GFX9-NEXT: s_add_u32 s2, s7, s2 +; GFX9-NEXT: s_addc_u32 s6, s6, 0 +; GFX9-NEXT: s_add_u32 s2, s5, s2 +; GFX9-NEXT: s_addc_u32 s2, s4, 0 +; GFX9-NEXT: s_add_u32 s2, s6, s2 +; GFX9-NEXT: s_addc_u32 s4, 0, 0 +; GFX9-NEXT: s_mul_i32 s6, s3, 0x6ca94220 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, 0x6ca94220 +; GFX9-NEXT: s_add_u32 s2, s6, s2 +; GFX9-NEXT: s_addc_u32 s4, s5, s4 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 +; GFX9-NEXT: s_mul_i32 s5, s3, 0x6ca94220 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19 +; GFX9-NEXT: s_add_i32 s5, s6, s5 +; GFX9-NEXT: s_mul_i32 s3, s3, 0xfd81e19 +; GFX9-NEXT: s_add_i32 s5, s5, s3 +; GFX9-NEXT: s_add_u32 s2, s2, s3 +; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 19 +; GFX9-NEXT: s_lshr_b32 s2, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, ptr addrspace(1) %out @@ -7747,7 +7747,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7765,17 +7765,17 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s7, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, 0 -; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i64 %x, 4096 store i64 %r, ptr addrspace(1) %out @@ -7791,7 +7791,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7803,26 +7803,27 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_sub_u32 s4, 0, s10 ; GFX6-NEXT: s_subb_u32 s5, 0, s11 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s12, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_add_u32 s2, s2, s12 -; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s3, s3, s12 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX6-NEXT: s_addc_u32 s3, s3, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -7844,7 +7845,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 @@ -7927,19 +7927,19 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_u32 s0, 0, s10 -; GFX9-NEXT: s_subb_u32 s1, 0, s11 +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_u32 s0, 0, s6 +; GFX9-NEXT: s_subb_u32 s1, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7949,114 +7949,114 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s12, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s3 -; GFX9-NEXT: s_mul_i32 s13, s1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: s_mul_i32 s12, s0, s4 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s5 +; GFX9-NEXT: s_mul_i32 s13, s1, s5 ; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s0, s3 +; GFX9-NEXT: s_mul_i32 s15, s0, s5 ; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s12 -; GFX9-NEXT: s_mul_i32 s3, s3, s12 -; GFX9-NEXT: s_add_u32 s3, s14, s3 +; GFX9-NEXT: s_mul_hi_u32 s14, s5, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX9-NEXT: s_mul_i32 s5, s5, s12 +; GFX9-NEXT: s_add_u32 s5, s14, s5 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s2, s15 -; GFX9-NEXT: s_mul_i32 s15, s2, s15 -; GFX9-NEXT: s_add_u32 s3, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s2, s12 -; GFX9-NEXT: s_addc_u32 s3, s13, s16 +; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 +; GFX9-NEXT: s_mul_i32 s15, s4, s15 +; GFX9-NEXT: s_add_u32 s5, s5, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s12 +; GFX9-NEXT: s_addc_u32 s5, s13, s16 ; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s2, s12 -; GFX9-NEXT: s_add_u32 s3, s3, s12 +; GFX9-NEXT: s_mul_i32 s12, s4, s12 +; GFX9-NEXT: s_add_u32 s5, s5, s12 ; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s5, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s12 +; GFX9-NEXT: s_addc_u32 s4, s4, s12 ; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: s_mul_i32 s5, s0, s4 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s3, s13, s3 +; GFX9-NEXT: s_add_i32 s5, s13, s5 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_add_i32 s5, s5, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s2, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s0 +; GFX9-NEXT: s_mul_i32 s14, s4, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s5 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s5 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: s_mul_i32 s5, s4, s5 +; GFX9-NEXT: s_add_u32 s0, s0, s5 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s2, s1 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: s_addc_u32 s12, s4, s1 +; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_add_u32 s0, s10, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s1, s11, s4 +; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s13, v1 -; GFX9-NEXT: s_mul_i32 s1, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s12 +; GFX9-NEXT: s_mul_i32 s1, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12 ; GFX9-NEXT: s_add_u32 s1, s14, s1 ; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s13 -; GFX9-NEXT: s_mul_i32 s13, s7, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13 +; GFX9-NEXT: s_mul_i32 s13, s11, s13 ; GFX9-NEXT: s_add_u32 s1, s1, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12 ; GFX9-NEXT: s_addc_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s7, s12 +; GFX9-NEXT: s_mul_i32 s12, s11, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s0, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s11, s12 +; GFX9-NEXT: s_mul_i32 s1, s7, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s1, s6, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_sub_i32 s0, s7, s14 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 +; GFX9-NEXT: s_sub_i32 s0, s11, s14 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s11 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s10, v1 +; GFX9-NEXT: s_subb_u32 s10, s0, s7 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 +; GFX9-NEXT: s_cmp_ge_u32 s10, s7 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX9-NEXT: s_cmp_eq_u32 s6, s11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2 +; GFX9-NEXT: s_cmp_eq_u32 s10, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s12, 1 -; GFX9-NEXT: s_addc_u32 s6, s13, 0 +; GFX9-NEXT: s_addc_u32 s10, s13, 0 ; GFX9-NEXT: s_add_u32 s1, s12, 2 ; GFX9-NEXT: s_addc_u32 s15, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mov_b32_e32 v4, s15 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s11 +; GFX9-NEXT: s_subb_u32 s0, s11, s14 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -8066,13 +8066,13 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ -8093,49 +8093,49 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: sdiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20 -; GFX6-NEXT: s_add_u32 s4, s4, s8 -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_ashr_i32 s8, s7, 31 -; GFX6-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 +; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20 -; GFX6-NEXT: s_add_u32 s6, s6, s8 -; GFX6-NEXT: s_addc_u32 s7, s7, 0 -; GFX6-NEXT: s_ashr_i64 s[6:7], s[6:7], 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_add_u32 s2, s2, s8 +; GFX6-NEXT: s_addc_u32 s3, s3, 0 +; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s5, s7, 0 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8155,45 +8155,45 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x8008009 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080080 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, s7, v2 -; GFX6-NEXT: s_ashr_i32 s8, s5, 31 -; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 -; GFX6-NEXT: s_mul_i32 s9, s7, 0x8008009 -; GFX6-NEXT: s_lshr_b32 s8, s8, 20 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s9, v3 -; GFX6-NEXT: s_add_u32 s4, s4, s8 -; GFX6-NEXT: s_mul_i32 s8, s6, 0x80080080 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 +; GFX6-NEXT: s_mul_i32 s7, s11, 0x8008009 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GFX6-NEXT: s_mul_i32 s6, s10, 0x80080080 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s8, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: s_lshr_b32 s4, s4, 20 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GFX6-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, vcc -; GFX6-NEXT: s_addc_u32 s5, s5, 0 -; GFX6-NEXT: s_ashr_i32 s9, s7, 31 -; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX6-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX6-NEXT: s_mul_i32 s8, s7, 0x80080080 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s8, v1 -; GFX6-NEXT: s_mul_i32 s8, s9, 0x80080080 +; GFX6-NEXT: s_add_u32 s4, s8, s4 +; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc +; GFX6-NEXT: s_addc_u32 s5, s9, 0 +; GFX6-NEXT: s_ashr_i32 s7, s11, 31 +; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: s_mul_i32 s6, s11, 0x80080080 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GFX6-NEXT: s_mul_i32 s6, s7, 0x80080080 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s8, v2 -; GFX6-NEXT: s_mul_i32 s8, s9, 0x8008009 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s8, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: s_mul_i32 s6, s7, 0x8008009 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s11 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v3 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s10, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; GFX6-NEXT: v_ashr_i64 v[2:3], v[0:1], 11 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 @@ -8208,51 +8208,51 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 -; GFX9-NEXT: s_mul_i32 s5, s7, 0x8008009 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, 0x8008009 -; GFX9-NEXT: s_mul_hi_u32 s4, s7, 0x8008009 -; GFX9-NEXT: s_add_u32 s5, s5, s10 -; GFX9-NEXT: s_mul_i32 s9, s6, 0x80080080 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_mul_i32 s9, s3, 0x8008009 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, 0x8008009 +; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x8008009 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_mul_i32 s8, s2, 0x80080080 ; GFX9-NEXT: s_addc_u32 s4, s4, 0 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, 0x80080080 -; GFX9-NEXT: s_add_u32 s5, s9, s5 -; GFX9-NEXT: s_addc_u32 s5, s8, 0 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, 0x80080080 +; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: s_add_u32 s4, s4, s5 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 -; GFX9-NEXT: s_mul_i32 s9, s7, 0x80080080 -; GFX9-NEXT: s_mul_hi_u32 s8, s7, 0x80080080 +; GFX9-NEXT: s_mul_i32 s9, s3, 0x80080080 +; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x80080080 ; GFX9-NEXT: s_add_u32 s4, s9, s4 ; GFX9-NEXT: s_addc_u32 s5, s8, s5 -; GFX9-NEXT: s_ashr_i32 s8, s7, 31 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 ; GFX9-NEXT: s_mul_i32 s9, s8, 0x80080080 ; GFX9-NEXT: s_mul_hi_u32 s10, s8, 0x8008009 ; GFX9-NEXT: s_add_i32 s9, s10, s9 ; GFX9-NEXT: s_mul_i32 s8, s8, 0x8008009 ; GFX9-NEXT: s_add_i32 s9, s9, s8 -; GFX9-NEXT: s_sub_u32 s8, s8, s6 -; GFX9-NEXT: s_subb_u32 s9, s9, s7 +; GFX9-NEXT: s_sub_u32 s8, s8, s2 +; GFX9-NEXT: s_subb_u32 s9, s9, s3 ; GFX9-NEXT: s_add_u32 s4, s4, s8 ; GFX9-NEXT: s_addc_u32 s5, s5, s9 -; GFX9-NEXT: s_add_u32 s4, s4, s6 -; GFX9-NEXT: s_addc_u32 s5, s5, s7 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 11 -; GFX9-NEXT: s_lshr_b32 s4, s5, 31 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s5, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 11 +; GFX9-NEXT: s_lshr_b32 s2, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s4, s2 +; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8275,26 +8275,26 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 -; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s16, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s16 -; GFX6-NEXT: s_mov_b32 s17, s16 -; GFX6-NEXT: s_addc_u32 s1, s1, s16 -; GFX6-NEXT: s_xor_b64 s[12:13], s[0:1], s[16:17] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s0, 0, s12 -; GFX6-NEXT: s_subb_u32 s1, 0, s13 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 +; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s14 +; GFX6-NEXT: s_ashr_i32 s12, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s1, s1, s12 +; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[12:13] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_u32 s0, 0, s2 +; GFX6-NEXT: s_subb_u32 s1, 0, s3 +; GFX6-NEXT: s_ashr_i32 s16, s9, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -8343,42 +8343,42 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s2 +; GFX6-NEXT: s_add_u32 s0, s8, s16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s2 +; GFX6-NEXT: s_addc_u32 s1, s9, s16 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[16:17] +; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s9, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -8387,23 +8387,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[16:17] -; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: s_add_u32 s4, s14, s2 -; GFX6-NEXT: v_mov_b32_e32 v6, s5 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s5, s15, s2 -; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[12:13] +; GFX6-NEXT: s_ashr_i32 s8, s15, 31 +; GFX6-NEXT: s_add_u32 s12, s14, s8 +; GFX6-NEXT: v_mov_b32_e32 v6, s9 +; GFX6-NEXT: s_mov_b32 s9, s8 +; GFX6-NEXT: s_addc_u32 s13, s15, s8 +; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -8412,16 +8412,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s12, 0, s4 +; GFX6-NEXT: s_sub_u32 s2, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 -; GFX6-NEXT: s_subb_u32 s13, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX6-NEXT: s_subb_u32 s3, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -8440,11 +8440,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -8459,48 +8459,48 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s11, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s12 +; GFX6-NEXT: s_add_u32 s10, s10, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s7, s7, s12 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s11, s11, s2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX6-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] +; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s5, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s4, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -8509,34 +8509,34 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v7, v9, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v8, s7 +; GFX6-NEXT: v_mov_b32_e32 v8, s11 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, s1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 -; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 +; GFX9-NEXT: s_lshl_b64 s[6:7], 0x1000, s14 ; GFX9-NEXT: s_ashr_i32 s12, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 @@ -8544,7 +8544,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s14 ; GFX9-NEXT: s_subb_u32 s1, 0, s15 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 @@ -8555,73 +8554,73 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s16, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s18, s0, s3 -; GFX9-NEXT: s_mul_i32 s17, s1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s16, s0, s4 +; GFX9-NEXT: s_mul_hi_u32 s18, s0, s5 +; GFX9-NEXT: s_mul_i32 s17, s1, s5 ; GFX9-NEXT: s_add_i32 s16, s18, s16 -; GFX9-NEXT: s_mul_i32 s19, s0, s3 +; GFX9-NEXT: s_mul_i32 s19, s0, s5 ; GFX9-NEXT: s_add_i32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s17, s3, s16 -; GFX9-NEXT: s_mul_i32 s18, s3, s16 -; GFX9-NEXT: s_mul_hi_u32 s3, s3, s19 -; GFX9-NEXT: s_add_u32 s3, s3, s18 +; GFX9-NEXT: s_mul_hi_u32 s17, s5, s16 +; GFX9-NEXT: s_mul_i32 s18, s5, s16 +; GFX9-NEXT: s_mul_hi_u32 s5, s5, s19 +; GFX9-NEXT: s_add_u32 s5, s5, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s2, s19 -; GFX9-NEXT: s_mul_i32 s19, s2, s19 -; GFX9-NEXT: s_add_u32 s3, s3, s19 -; GFX9-NEXT: s_mul_hi_u32 s18, s2, s16 -; GFX9-NEXT: s_addc_u32 s3, s17, s20 +; GFX9-NEXT: s_mul_hi_u32 s20, s4, s19 +; GFX9-NEXT: s_mul_i32 s19, s4, s19 +; GFX9-NEXT: s_add_u32 s5, s5, s19 +; GFX9-NEXT: s_mul_hi_u32 s18, s4, s16 +; GFX9-NEXT: s_addc_u32 s5, s17, s20 ; GFX9-NEXT: s_addc_u32 s17, s18, 0 -; GFX9-NEXT: s_mul_i32 s16, s2, s16 -; GFX9-NEXT: s_add_u32 s3, s3, s16 +; GFX9-NEXT: s_mul_i32 s16, s4, s16 +; GFX9-NEXT: s_add_u32 s5, s5, s16 ; GFX9-NEXT: s_addc_u32 s16, 0, s17 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s16 +; GFX9-NEXT: s_addc_u32 s4, s4, s16 ; GFX9-NEXT: v_readfirstlane_b32 s16, v0 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: s_mul_i32 s5, s0, s4 ; GFX9-NEXT: s_mul_hi_u32 s17, s0, s16 -; GFX9-NEXT: s_add_i32 s3, s17, s3 +; GFX9-NEXT: s_add_i32 s5, s17, s5 ; GFX9-NEXT: s_mul_i32 s1, s1, s16 -; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_add_i32 s5, s5, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s16 -; GFX9-NEXT: s_mul_hi_u32 s17, s2, s0 -; GFX9-NEXT: s_mul_i32 s18, s2, s0 -; GFX9-NEXT: s_mul_i32 s20, s16, s3 +; GFX9-NEXT: s_mul_hi_u32 s17, s4, s0 +; GFX9-NEXT: s_mul_i32 s18, s4, s0 +; GFX9-NEXT: s_mul_i32 s20, s16, s5 ; GFX9-NEXT: s_mul_hi_u32 s0, s16, s0 -; GFX9-NEXT: s_mul_hi_u32 s19, s16, s3 +; GFX9-NEXT: s_mul_hi_u32 s19, s16, s5 ; GFX9-NEXT: s_add_u32 s0, s0, s20 ; GFX9-NEXT: s_addc_u32 s16, 0, s19 ; GFX9-NEXT: s_add_u32 s0, s0, s18 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5 ; GFX9-NEXT: s_addc_u32 s0, s16, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: s_mul_i32 s5, s4, s5 +; GFX9-NEXT: s_add_u32 s0, s0, s5 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s16, s2, s1 -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s0, s4, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s5, s2 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] +; GFX9-NEXT: s_addc_u32 s16, s4, s1 +; GFX9-NEXT: s_ashr_i32 s4, s9, 31 +; GFX9-NEXT: s_add_u32 s0, s8, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s1, s9, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s17, v0 -; GFX9-NEXT: s_mul_i32 s1, s4, s16 -; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 -; GFX9-NEXT: s_mul_hi_u32 s0, s4, s16 +; GFX9-NEXT: s_mul_i32 s1, s8, s16 +; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 +; GFX9-NEXT: s_mul_hi_u32 s0, s8, s16 ; GFX9-NEXT: s_add_u32 s1, s18, s1 ; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s19, s5, s17 -; GFX9-NEXT: s_mul_i32 s17, s5, s17 +; GFX9-NEXT: s_mul_hi_u32 s19, s9, s17 +; GFX9-NEXT: s_mul_i32 s17, s9, s17 ; GFX9-NEXT: s_add_u32 s1, s1, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s5, s16 +; GFX9-NEXT: s_mul_hi_u32 s18, s9, s16 ; GFX9-NEXT: s_addc_u32 s0, s0, s19 ; GFX9-NEXT: s_addc_u32 s1, s18, 0 -; GFX9-NEXT: s_mul_i32 s16, s5, s16 +; GFX9-NEXT: s_mul_i32 s16, s9, s16 ; GFX9-NEXT: s_add_u32 s16, s0, s16 ; GFX9-NEXT: s_addc_u32 s17, 0, s1 ; GFX9-NEXT: s_mul_i32 s0, s14, s17 @@ -8631,34 +8630,34 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_add_i32 s18, s0, s1 ; GFX9-NEXT: s_mul_i32 s1, s14, s16 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_sub_i32 s0, s5, s18 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: s_sub_i32 s0, s9, s18 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s4, s0, s15 +; GFX9-NEXT: s_subb_u32 s8, s0, s15 ; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s14, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s4, s4, 0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s15 +; GFX9-NEXT: s_subb_u32 s8, s8, 0 +; GFX9-NEXT: s_cmp_ge_u32 s8, s15 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v1 -; GFX9-NEXT: s_cmp_eq_u32 s4, s15 +; GFX9-NEXT: s_cmp_eq_u32 s8, s15 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s16, 1 -; GFX9-NEXT: s_addc_u32 s4, s17, 0 +; GFX9-NEXT: s_addc_u32 s8, s17, 0 ; GFX9-NEXT: s_add_u32 s1, s16, 2 ; GFX9-NEXT: s_addc_u32 s19, s17, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s5, s18 +; GFX9-NEXT: s_subb_u32 s0, s9, s18 ; GFX9-NEXT: s_cmp_ge_u32 s0, s15 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 @@ -8666,28 +8665,28 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] -; GFX9-NEXT: s_ashr_i32 s2, s11, 31 +; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[12:13] +; GFX9-NEXT: s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: s_add_u32 s4, s10, s2 +; GFX9-NEXT: s_add_u32 s6, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s5, s11, s2 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s7, s7, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX9-NEXT: v_xor_b32_e32 v5, s1, v0 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v1 ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_u32 s0, 0, s4 +; GFX9-NEXT: s_sub_u32 s0, 0, s6 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: s_subb_u32 s1, 0, s5 +; GFX9-NEXT: s_subb_u32 s1, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -8695,114 +8694,114 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v6, vcc -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 ; GFX9-NEXT: v_readfirstlane_b32 s13, v3 -; GFX9-NEXT: s_mul_hi_u32 s12, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s12, s0, s8 ; GFX9-NEXT: s_mul_i32 s14, s0, s13 -; GFX9-NEXT: s_mul_i32 s11, s1, s10 +; GFX9-NEXT: s_mul_i32 s9, s1, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s14 -; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_i32 s15, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s11, s10, s12 -; GFX9-NEXT: s_mul_i32 s14, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s10, s10, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s14 -; GFX9-NEXT: s_addc_u32 s11, 0, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s9 +; GFX9-NEXT: s_mul_i32 s15, s0, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s12 +; GFX9-NEXT: s_mul_i32 s14, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s8, s8, s15 +; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 ; GFX9-NEXT: s_mul_hi_u32 s16, s13, s15 ; GFX9-NEXT: s_mul_i32 s15, s13, s15 -; GFX9-NEXT: s_add_u32 s10, s10, s15 +; GFX9-NEXT: s_add_u32 s8, s8, s15 ; GFX9-NEXT: s_mul_hi_u32 s14, s13, s12 -; GFX9-NEXT: s_addc_u32 s10, s11, s16 -; GFX9-NEXT: s_addc_u32 s11, s14, 0 +; GFX9-NEXT: s_addc_u32 s8, s9, s16 +; GFX9-NEXT: s_addc_u32 s9, s14, 0 ; GFX9-NEXT: s_mul_i32 s12, s13, s12 -; GFX9-NEXT: s_add_u32 s10, s10, s12 -; GFX9-NEXT: s_addc_u32 s11, 0, s11 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s10, v2 +; GFX9-NEXT: s_add_u32 s8, s8, s12 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s10, s13, s11 +; GFX9-NEXT: s_addc_u32 s8, s13, s9 ; GFX9-NEXT: v_readfirstlane_b32 s12, v2 -; GFX9-NEXT: s_mul_i32 s11, s0, s10 +; GFX9-NEXT: s_mul_i32 s9, s0, s8 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s11, s13, s11 +; GFX9-NEXT: s_add_i32 s9, s13, s9 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s11, s11, s1 +; GFX9-NEXT: s_add_i32 s9, s9, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 -; GFX9-NEXT: s_mul_i32 s14, s10, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s0 +; GFX9-NEXT: s_mul_i32 s14, s8, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s9 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s9 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s9 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s11, s10, s11 -; GFX9-NEXT: s_add_u32 s0, s0, s11 +; GFX9-NEXT: s_mul_i32 s9, s8, s9 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s10, s1 -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: s_addc_u32 s12, s8, s1 +; GFX9-NEXT: s_ashr_i32 s8, s11, 31 +; GFX9-NEXT: s_add_u32 s0, s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s11, s8 +; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX9-NEXT: v_readfirstlane_b32 s13, v2 -; GFX9-NEXT: s_mul_i32 s1, s6, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s12 +; GFX9-NEXT: s_mul_i32 s1, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12 ; GFX9-NEXT: s_add_u32 s1, s14, s1 ; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s7, s13 -; GFX9-NEXT: s_mul_i32 s13, s7, s13 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13 +; GFX9-NEXT: s_mul_i32 s13, s11, s13 ; GFX9-NEXT: s_add_u32 s1, s1, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s7, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12 ; GFX9-NEXT: s_addc_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s7, s12 +; GFX9-NEXT: s_mul_i32 s12, s11, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s4, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s4, s12 +; GFX9-NEXT: s_mul_i32 s0, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s5, s12 +; GFX9-NEXT: s_mul_i32 s1, s7, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s4, s12 +; GFX9-NEXT: s_mul_i32 s1, s6, s12 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_sub_i32 s0, s7, s14 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 +; GFX9-NEXT: s_sub_i32 s0, s11, s14 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s5 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2 +; GFX9-NEXT: s_subb_u32 s10, s0, s7 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s6, v2 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s5 +; GFX9-NEXT: s_subb_u32 s10, s10, 0 +; GFX9-NEXT: s_cmp_ge_u32 s10, s7 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 -; GFX9-NEXT: s_cmp_eq_u32 s6, s5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3 +; GFX9-NEXT: s_cmp_eq_u32 s10, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s12, 1 -; GFX9-NEXT: s_addc_u32 s6, s13, 0 +; GFX9-NEXT: s_addc_u32 s10, s13, 0 ; GFX9-NEXT: s_add_u32 s1, s12, 2 ; GFX9-NEXT: s_addc_u32 s15, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 ; GFX9-NEXT: v_mov_b32_e32 v6, s15 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s5 +; GFX9-NEXT: s_subb_u32 s0, s11, s14 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GFX9-NEXT: s_cmp_eq_u32 s0, s5 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX9-NEXT: s_cmp_eq_u32 s0, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -8812,14 +8811,13 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] +; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_xor_b32_e32 v5, s1, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y @@ -8835,7 +8833,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xfd81e19 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x6ca94220 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8883,45 +8881,45 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s7, 0xfd81e19 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, 0xfd81e19 -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0xfd81e19 -; GFX9-NEXT: s_add_u32 s3, s3, s8 -; GFX9-NEXT: s_mul_i32 s1, s6, 0x6ca94220 -; GFX9-NEXT: s_addc_u32 s2, s2, 0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, 0x6ca94220 -; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_addc_u32 s0, s0, 0 -; GFX9-NEXT: s_add_u32 s0, s2, s0 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: s_mul_i32 s3, s7, 0x6ca94220 -; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0x6ca94220 -; GFX9-NEXT: s_add_u32 s0, s3, s0 -; GFX9-NEXT: s_addc_u32 s1, s2, s1 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_mul_i32 s3, s2, 0x6ca94220 +; GFX9-NEXT: s_mul_i32 s7, s3, 0xfd81e19 ; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0xfd81e19 -; GFX9-NEXT: s_add_i32 s3, s8, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, 0xfd81e19 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 19 -; GFX9-NEXT: s_lshr_b32 s0, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s2, s0 -; GFX9-NEXT: s_addc_u32 s1, s3, 0 -; GFX9-NEXT: s_mul_i32 s1, s1, 0x12d8fb -; GFX9-NEXT: s_mul_hi_u32 s2, s0, 0x12d8fb -; GFX9-NEXT: s_add_i32 s2, s2, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s1, s7, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19 +; GFX9-NEXT: s_add_u32 s7, s7, s8 +; GFX9-NEXT: s_mul_i32 s5, s2, 0x6ca94220 +; GFX9-NEXT: s_addc_u32 s6, s6, 0 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x6ca94220 +; GFX9-NEXT: s_add_u32 s5, s5, s7 +; GFX9-NEXT: s_addc_u32 s4, s4, 0 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s5, 0, 0 +; GFX9-NEXT: s_mul_i32 s7, s3, 0x6ca94220 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0x6ca94220 +; GFX9-NEXT: s_add_u32 s4, s7, s4 +; GFX9-NEXT: s_addc_u32 s5, s6, s5 +; GFX9-NEXT: s_ashr_i32 s6, s3, 31 +; GFX9-NEXT: s_mul_i32 s7, s6, 0x6ca94220 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, 0xfd81e19 +; GFX9-NEXT: s_add_i32 s7, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, 0xfd81e19 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_u32 s4, s4, s6 +; GFX9-NEXT: s_addc_u32 s5, s5, s7 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 19 +; GFX9-NEXT: s_lshr_b32 s4, s5, 31 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s5, s7, 0 +; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb +; GFX9-NEXT: s_mul_hi_u32 s6, s4, 0x12d8fb +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, 0x12d8fb +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, ptr addrspace(1) %out @@ -8936,7 +8934,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8956,19 +8954,19 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s7, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, 0 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s1, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_u32 s4, s2, s4 +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 4096 store i64 %r, ptr addrspace(1) %out @@ -8984,38 +8982,39 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s4 -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX6-NEXT: s_ashr_i32 s2, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s2 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s1, s1, s2 +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_sub_u32 s4, 0, s8 ; GFX6-NEXT: s_subb_u32 s5, 0, s9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s10, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s10 +; GFX6-NEXT: s_mov_b32 s11, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_add_u32 s2, s2, s10 -; GFX6-NEXT: s_mov_b32 s11, s10 +; GFX6-NEXT: s_addc_u32 s3, s3, s10 +; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX6-NEXT: s_addc_u32 s3, s3, s10 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -9037,7 +9036,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 -; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 @@ -9118,19 +9116,19 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_u32 s0, 0, s6 +; GFX9-NEXT: s_subb_u32 s1, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9142,46 +9140,46 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s10, s0, s2 +; GFX9-NEXT: s_mul_i32 s4, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s12, s0, s3 -; GFX9-NEXT: s_mul_i32 s11, s1, s3 -; GFX9-NEXT: s_add_i32 s10, s12, s10 +; GFX9-NEXT: s_mul_i32 s5, s1, s3 +; GFX9-NEXT: s_add_i32 s4, s12, s4 ; GFX9-NEXT: s_mul_i32 s13, s0, s3 -; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_add_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s12, s3, s13 -; GFX9-NEXT: s_mul_hi_u32 s11, s3, s10 -; GFX9-NEXT: s_mul_i32 s3, s3, s10 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 ; GFX9-NEXT: s_add_u32 s3, s12, s3 -; GFX9-NEXT: s_addc_u32 s11, 0, s11 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_mul_hi_u32 s14, s2, s13 ; GFX9-NEXT: s_mul_i32 s13, s2, s13 ; GFX9-NEXT: s_add_u32 s3, s3, s13 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s10 -; GFX9-NEXT: s_addc_u32 s3, s11, s14 -; GFX9-NEXT: s_addc_u32 s11, s12, 0 -; GFX9-NEXT: s_mul_i32 s10, s2, s10 -; GFX9-NEXT: s_add_u32 s3, s3, s10 -; GFX9-NEXT: s_addc_u32 s10, 0, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s4 +; GFX9-NEXT: s_addc_u32 s3, s5, s14 +; GFX9-NEXT: s_addc_u32 s5, s12, 0 +; GFX9-NEXT: s_mul_i32 s4, s2, s4 +; GFX9-NEXT: s_add_u32 s3, s3, s4 +; GFX9-NEXT: s_addc_u32 s4, 0, s5 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s10 -; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_addc_u32 s2, s2, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mul_i32 s3, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s11, s0, s10 -; GFX9-NEXT: s_add_i32 s3, s11, s3 -; GFX9-NEXT: s_mul_i32 s1, s1, s10 +; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4 +; GFX9-NEXT: s_add_i32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s1, s1, s4 ; GFX9-NEXT: s_add_i32 s3, s3, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s11, s2, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s4 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0 ; GFX9-NEXT: s_mul_i32 s12, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s10, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s10, s0 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s3 +; GFX9-NEXT: s_mul_i32 s14, s4, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_addc_u32 s10, 0, s13 +; GFX9-NEXT: s_addc_u32 s4, 0, s13 ; GFX9-NEXT: s_add_u32 s0, s0, s12 ; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 -; GFX9-NEXT: s_addc_u32 s0, s10, s11 +; GFX9-NEXT: s_addc_u32 s0, s4, s5 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s3 @@ -9189,52 +9187,51 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_addc_u32 s2, s2, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_add_u32 s0, s10, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s1, s11, s4 +; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s1, s6, s2 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 -; GFX9-NEXT: s_add_u32 s1, s11, s1 +; GFX9-NEXT: s_mul_i32 s1, s10, s2 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2 +; GFX9-NEXT: s_add_u32 s1, s5, s1 ; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s12, s7, s3 -; GFX9-NEXT: s_mul_i32 s3, s7, s3 +; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3 +; GFX9-NEXT: s_mul_i32 s3, s11, s3 ; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_mul_hi_u32 s11, s7, s2 +; GFX9-NEXT: s_mul_hi_u32 s5, s11, s2 ; GFX9-NEXT: s_addc_u32 s0, s0, s12 -; GFX9-NEXT: s_addc_u32 s1, s11, 0 -; GFX9-NEXT: s_mul_i32 s2, s7, s2 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_mul_i32 s2, s11, s2 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s1 -; GFX9-NEXT: s_mul_hi_u32 s2, s8, s0 +; GFX9-NEXT: s_mul_i32 s1, s6, s1 +; GFX9-NEXT: s_mul_hi_u32 s2, s6, s0 ; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_mul_i32 s2, s9, s0 -; GFX9-NEXT: s_mul_i32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s11, s1, s2 +; GFX9-NEXT: s_mul_i32 s2, s7, s0 +; GFX9-NEXT: s_mul_i32 s0, s6, s0 +; GFX9-NEXT: s_add_i32 s5, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_sub_i32 s1, s7, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 +; GFX9-NEXT: s_sub_i32 s1, s11, s5 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s1, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 +; GFX9-NEXT: s_subb_u32 s10, s1, s7 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s12, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s9 +; GFX9-NEXT: s_subb_u32 s12, s10, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s7 ; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v2 -; GFX9-NEXT: s_cmp_eq_u32 s12, s9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s6, v2 +; GFX9-NEXT: s_cmp_eq_u32 s12, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] -; GFX9-NEXT: s_subb_u32 s2, s6, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v2 +; GFX9-NEXT: s_subb_u32 s2, s10, s7 +; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s6, v2 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s2, s2, 0 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 @@ -9243,11 +9240,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s7, s11 -; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_subb_u32 s0, s11, s5 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -9256,12 +9253,12 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s10, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s10, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s4, v1 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y @@ -9282,57 +9279,57 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: srem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20 -; GFX6-NEXT: s_add_u32 s8, s4, s8 -; GFX6-NEXT: s_addc_u32 s9, s5, 0 +; GFX6-NEXT: s_add_u32 s8, s0, s8 +; GFX6-NEXT: s_addc_u32 s9, s1, 0 ; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 -; GFX6-NEXT: s_sub_u32 s4, s4, s8 -; GFX6-NEXT: s_subb_u32 s5, s5, s9 -; GFX6-NEXT: s_ashr_i32 s8, s7, 31 +; GFX6-NEXT: s_sub_u32 s0, s0, s8 +; GFX6-NEXT: s_subb_u32 s1, s1, s9 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20 -; GFX6-NEXT: s_add_u32 s8, s6, s8 -; GFX6-NEXT: s_addc_u32 s9, s7, 0 +; GFX6-NEXT: s_add_u32 s8, s2, s8 +; GFX6-NEXT: s_addc_u32 s9, s3, 0 ; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 -; GFX6-NEXT: s_sub_u32 s6, s6, s8 -; GFX6-NEXT: s_subb_u32 s7, s7, s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: s_sub_u32 s2, s2, s8 +; GFX6-NEXT: s_subb_u32 s3, s3, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s2, s4, s2 -; GFX9-NEXT: s_subb_u32 s3, s5, s3 -; GFX9-NEXT: s_ashr_i32 s4, s7, 31 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s5, s7, 0 +; GFX9-NEXT: s_add_u32 s4, s0, s4 +; GFX9-NEXT: s_addc_u32 s5, s1, 0 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s4, s6, s4 -; GFX9-NEXT: s_subb_u32 s5, s7, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_sub_u32 s0, s0, s4 +; GFX9-NEXT: s_subb_u32 s1, s1, s5 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_u32 s4, s2, s4 +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm %r = srem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -9355,26 +9352,26 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 -; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s8 -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s8 -; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 +; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s14 +; GFX6-NEXT: s_ashr_i32 s2, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s2 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s1, s1, s2 +; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GFX6-NEXT: s_sub_u32 s0, 0, s14 ; GFX6-NEXT: s_subb_u32 s1, 0, s15 -; GFX6-NEXT: s_ashr_i32 s12, s5, 31 +; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -9423,20 +9420,20 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s12 +; GFX6-NEXT: s_add_u32 s0, s8, s12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s12 +; GFX6-NEXT: s_addc_u32 s1, s9, s12 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] +; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -9448,9 +9445,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_lo_u32 v0, s14, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s14, v0 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] @@ -9470,11 +9467,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_add_u32 s2, s16, s0 ; GFX6-NEXT: s_mov_b32 s1, s0 ; GFX6-NEXT: s_addc_u32 s3, s17, s0 -; GFX6-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[0:1] ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s9 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 @@ -9491,13 +9488,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: s_sub_u32 s0, 0, s4 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: s_subb_u32 s1, 0, s5 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 -; GFX6-NEXT: s_ashr_i32 s14, s7, 31 +; GFX6-NEXT: s_ashr_i32 s14, s11, 31 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 @@ -9540,59 +9537,59 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s0, s6, s14 +; GFX6-NEXT: s_add_u32 s0, s10, s14 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_addc_u32 s1, s7, s14 +; GFX6-NEXT: s_addc_u32 s1, s11, s14 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[14:15] -; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] +; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, s12 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s4, v2 +; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 ; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v7 -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s4, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NEXT: v_mov_b32_e32 v6, s11 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc @@ -9602,25 +9599,24 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mov_b32_e32 v4, s14 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 -; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 +; GFX9-NEXT: s_lshl_b64 s[14:15], 0x1000, s14 +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 @@ -9633,46 +9629,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s14, s0, s2 +; GFX9-NEXT: s_mul_i32 s4, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s16, s0, s3 -; GFX9-NEXT: s_mul_i32 s15, s1, s3 -; GFX9-NEXT: s_add_i32 s14, s16, s14 +; GFX9-NEXT: s_mul_i32 s5, s1, s3 +; GFX9-NEXT: s_add_i32 s4, s16, s4 ; GFX9-NEXT: s_mul_i32 s17, s0, s3 -; GFX9-NEXT: s_add_i32 s14, s14, s15 -; GFX9-NEXT: s_mul_hi_u32 s15, s3, s14 -; GFX9-NEXT: s_mul_i32 s16, s3, s14 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX9-NEXT: s_mul_i32 s16, s3, s4 ; GFX9-NEXT: s_mul_hi_u32 s3, s3, s17 ; GFX9-NEXT: s_add_u32 s3, s3, s16 -; GFX9-NEXT: s_addc_u32 s15, 0, s15 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_mul_hi_u32 s18, s2, s17 ; GFX9-NEXT: s_mul_i32 s17, s2, s17 ; GFX9-NEXT: s_add_u32 s3, s3, s17 -; GFX9-NEXT: s_mul_hi_u32 s16, s2, s14 -; GFX9-NEXT: s_addc_u32 s3, s15, s18 -; GFX9-NEXT: s_addc_u32 s15, s16, 0 -; GFX9-NEXT: s_mul_i32 s14, s2, s14 -; GFX9-NEXT: s_add_u32 s3, s3, s14 -; GFX9-NEXT: s_addc_u32 s14, 0, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s2, s4 +; GFX9-NEXT: s_addc_u32 s3, s5, s18 +; GFX9-NEXT: s_addc_u32 s5, s16, 0 +; GFX9-NEXT: s_mul_i32 s4, s2, s4 +; GFX9-NEXT: s_add_u32 s3, s3, s4 +; GFX9-NEXT: s_addc_u32 s4, 0, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s14 -; GFX9-NEXT: v_readfirstlane_b32 s14, v0 +; GFX9-NEXT: s_addc_u32 s2, s2, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mul_i32 s3, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s15, s0, s14 -; GFX9-NEXT: s_add_i32 s3, s15, s3 -; GFX9-NEXT: s_mul_i32 s1, s1, s14 +; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4 +; GFX9-NEXT: s_add_i32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s1, s1, s4 ; GFX9-NEXT: s_add_i32 s3, s3, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s15, s2, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s4 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0 ; GFX9-NEXT: s_mul_i32 s16, s2, s0 -; GFX9-NEXT: s_mul_i32 s18, s14, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s14, s0 -; GFX9-NEXT: s_mul_hi_u32 s17, s14, s3 +; GFX9-NEXT: s_mul_i32 s18, s4, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0 +; GFX9-NEXT: s_mul_hi_u32 s17, s4, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s18 -; GFX9-NEXT: s_addc_u32 s14, 0, s17 +; GFX9-NEXT: s_addc_u32 s4, 0, s17 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 -; GFX9-NEXT: s_addc_u32 s0, s14, s15 +; GFX9-NEXT: s_addc_u32 s0, s4, s5 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s3 @@ -9680,23 +9676,23 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_addc_u32 s2, s2, s1 -; GFX9-NEXT: s_ashr_i32 s14, s5, 31 -; GFX9-NEXT: s_add_u32 s0, s4, s14 -; GFX9-NEXT: s_mov_b32 s15, s14 -; GFX9-NEXT: s_addc_u32 s1, s5, s14 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] +; GFX9-NEXT: s_ashr_i32 s16, s9, 31 +; GFX9-NEXT: s_add_u32 s0, s8, s16 +; GFX9-NEXT: s_mov_b32 s17, s16 +; GFX9-NEXT: s_addc_u32 s1, s9, s16 +; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: s_mul_i32 s1, s4, s2 -; GFX9-NEXT: s_mul_hi_u32 s15, s4, s3 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s3 ; GFX9-NEXT: s_mul_hi_u32 s0, s4, s2 -; GFX9-NEXT: s_add_u32 s1, s15, s1 +; GFX9-NEXT: s_add_u32 s1, s8, s1 ; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s16, s5, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s5, s3 ; GFX9-NEXT: s_mul_i32 s3, s5, s3 ; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_mul_hi_u32 s15, s5, s2 -; GFX9-NEXT: s_addc_u32 s0, s0, s16 -; GFX9-NEXT: s_addc_u32 s1, s15, 0 +; GFX9-NEXT: s_mul_hi_u32 s8, s5, s2 +; GFX9-NEXT: s_addc_u32 s0, s0, s9 +; GFX9-NEXT: s_addc_u32 s1, s8, 0 ; GFX9-NEXT: s_mul_i32 s2, s5, s2 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 @@ -9705,19 +9701,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_mul_i32 s2, s13, s0 ; GFX9-NEXT: s_mul_i32 s0, s12, s0 -; GFX9-NEXT: s_add_i32 s15, s1, s2 +; GFX9-NEXT: s_add_i32 s8, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_sub_i32 s1, s5, s15 +; GFX9-NEXT: s_sub_i32 s1, s5, s8 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_subb_u32 s4, s1, s13 ; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s16, s4, 0 -; GFX9-NEXT: s_cmp_ge_u32 s16, s13 +; GFX9-NEXT: s_subb_u32 s9, s4, 0 +; GFX9-NEXT: s_cmp_ge_u32 s9, s13 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1 -; GFX9-NEXT: s_cmp_eq_u32 s16, s13 +; GFX9-NEXT: s_cmp_eq_u32 s9, s13 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -9729,11 +9725,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_subb_u32 s2, s2, 0 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s5, s15 +; GFX9-NEXT: s_subb_u32 s0, s5, s8 ; GFX9-NEXT: s_cmp_ge_u32 s0, s13 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 @@ -9743,22 +9739,22 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: s_ashr_i32 s0, s11, 31 -; GFX9-NEXT: s_add_u32 s2, s10, s0 +; GFX9-NEXT: s_ashr_i32 s0, s15, 31 +; GFX9-NEXT: s_add_u32 s2, s14, s0 ; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_addc_u32 s3, s11, s0 +; GFX9-NEXT: s_addc_u32 s3, s15, s0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s14, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 +; GFX9-NEXT: v_xor_b32_e32 v0, s16, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s16, v2 ; GFX9-NEXT: v_mac_f32_e32 v1, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v5, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 @@ -9769,47 +9765,47 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_sub_u32 s0, 0, s4 ; GFX9-NEXT: s_subb_u32 s1, 0, s5 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s11, v3 -; GFX9-NEXT: s_mul_hi_u32 s10, s0, s2 -; GFX9-NEXT: s_mul_i32 s12, s0, s11 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX9-NEXT: s_mul_i32 s12, s0, s9 ; GFX9-NEXT: s_mul_i32 s3, s1, s2 -; GFX9-NEXT: s_add_i32 s10, s10, s12 -; GFX9-NEXT: s_add_i32 s10, s10, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s12 +; GFX9-NEXT: s_add_i32 s8, s8, s3 ; GFX9-NEXT: s_mul_i32 s13, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s10 -; GFX9-NEXT: s_mul_i32 s12, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s8 +; GFX9-NEXT: s_mul_i32 s12, s2, s8 ; GFX9-NEXT: s_mul_hi_u32 s2, s2, s13 ; GFX9-NEXT: s_add_u32 s2, s2, s12 ; GFX9-NEXT: s_addc_u32 s3, 0, s3 -; GFX9-NEXT: s_mul_hi_u32 s14, s11, s13 -; GFX9-NEXT: s_mul_i32 s13, s11, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 +; GFX9-NEXT: s_mul_i32 s13, s9, s13 ; GFX9-NEXT: s_add_u32 s2, s2, s13 -; GFX9-NEXT: s_mul_hi_u32 s12, s11, s10 +; GFX9-NEXT: s_mul_hi_u32 s12, s9, s8 ; GFX9-NEXT: s_addc_u32 s2, s3, s14 ; GFX9-NEXT: s_addc_u32 s3, s12, 0 -; GFX9-NEXT: s_mul_i32 s10, s11, s10 -; GFX9-NEXT: s_add_u32 s2, s2, s10 +; GFX9-NEXT: s_mul_i32 s8, s9, s8 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_addc_u32 s3, 0, s3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s11, s3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: s_addc_u32 s2, s9, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 ; GFX9-NEXT: s_mul_i32 s3, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s11, s0, s10 -; GFX9-NEXT: s_add_i32 s3, s11, s3 -; GFX9-NEXT: s_mul_i32 s1, s1, s10 +; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 +; GFX9-NEXT: s_add_i32 s3, s9, s3 +; GFX9-NEXT: s_mul_i32 s1, s1, s8 ; GFX9-NEXT: s_add_i32 s3, s3, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s11, s2, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s0 ; GFX9-NEXT: s_mul_i32 s12, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s10, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s10, s0 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s3 +; GFX9-NEXT: s_mul_i32 s14, s8, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_addc_u32 s10, 0, s13 +; GFX9-NEXT: s_addc_u32 s8, 0, s13 ; GFX9-NEXT: s_add_u32 s0, s0, s12 ; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 -; GFX9-NEXT: s_addc_u32 s0, s10, s11 +; GFX9-NEXT: s_addc_u32 s0, s8, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: s_add_u32 s0, s0, s3 @@ -9817,24 +9813,24 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_addc_u32 s2, s2, s1 -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: s_ashr_i32 s8, s11, 31 +; GFX9-NEXT: s_add_u32 s0, s10, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s11, s8 +; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 -; GFX9-NEXT: s_mul_i32 s1, s6, s2 -; GFX9-NEXT: s_mul_hi_u32 s11, s6, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 -; GFX9-NEXT: s_add_u32 s1, s11, s1 +; GFX9-NEXT: s_mul_i32 s1, s10, s2 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2 +; GFX9-NEXT: s_add_u32 s1, s9, s1 ; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s12, s7, s3 -; GFX9-NEXT: s_mul_i32 s3, s7, s3 +; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3 +; GFX9-NEXT: s_mul_i32 s3, s11, s3 ; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_mul_hi_u32 s11, s7, s2 +; GFX9-NEXT: s_mul_hi_u32 s9, s11, s2 ; GFX9-NEXT: s_addc_u32 s0, s0, s12 -; GFX9-NEXT: s_addc_u32 s1, s11, 0 -; GFX9-NEXT: s_mul_i32 s2, s7, s2 +; GFX9-NEXT: s_addc_u32 s1, s9, 0 +; GFX9-NEXT: s_mul_i32 s2, s11, s2 ; GFX9-NEXT: s_add_u32 s0, s0, s2 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: s_mul_i32 s1, s4, s1 @@ -9842,15 +9838,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_mul_i32 s2, s5, s0 ; GFX9-NEXT: s_mul_i32 s0, s4, s0 -; GFX9-NEXT: s_add_i32 s11, s1, s2 +; GFX9-NEXT: s_add_i32 s9, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: s_sub_i32 s1, s7, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 +; GFX9-NEXT: s_sub_i32 s1, s11, s9 +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s1, s5 +; GFX9-NEXT: s_subb_u32 s10, s1, s5 ; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s12, s6, 0 +; GFX9-NEXT: s_subb_u32 s12, s10, 0 ; GFX9-NEXT: s_cmp_ge_u32 s12, s5 ; GFX9-NEXT: s_cselect_b32 s13, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v3 @@ -9860,7 +9856,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] -; GFX9-NEXT: s_subb_u32 s2, s6, s5 +; GFX9-NEXT: s_subb_u32 s2, s10, s5 ; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v3 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s2, s2, 0 @@ -9870,7 +9866,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s7, s11 +; GFX9-NEXT: s_subb_u32 s0, s11, s9 ; GFX9-NEXT: s_cmp_ge_u32 s0, s5 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 @@ -9883,13 +9879,12 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mov_b32_e32 v7, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s10, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s8, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll index aa38f43368694d..ba5f9b7aa0d06d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll @@ -7,16 +7,16 @@ define void @uniform_br_no_metadata(i32 noundef inreg %value, ptr addrspace(8) n ; GFX9-LABEL: uniform_br_no_metadata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_i32 s21, 1 +; GFX9-NEXT: s_cmp_lt_i32 s23, 1 ; GFX9-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; %if.then -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s8, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_mov_b32 s7, s20 +; GFX9-NEXT: s_mov_b32 s6, s19 +; GFX9-NEXT: s_mov_b32 s5, s18 +; GFX9-NEXT: s_mov_b32 s4, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX9-NEXT: .LBB0_2: ; %if.end ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -24,16 +24,16 @@ define void @uniform_br_no_metadata(i32 noundef inreg %value, ptr addrspace(8) n ; GFX10-LABEL: uniform_br_no_metadata: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_i32 s21, 1 +; GFX10-NEXT: s_cmp_lt_i32 s23, 1 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %if.then -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s19 -; GFX10-NEXT: s_mov_b32 s11, s18 -; GFX10-NEXT: s_mov_b32 s10, s17 -; GFX10-NEXT: s_mov_b32 s9, s16 -; GFX10-NEXT: s_mov_b32 s8, s7 -; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: v_mov_b32_e32 v1, s21 +; GFX10-NEXT: s_mov_b32 s7, s20 +; GFX10-NEXT: s_mov_b32 s6, s19 +; GFX10-NEXT: s_mov_b32 s5, s18 +; GFX10-NEXT: s_mov_b32 s4, s17 +; GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX10-NEXT: .LBB0_2: ; %if.end ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -54,16 +54,16 @@ define void @uniform_br_unprofitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX9-LABEL: uniform_br_unprofitable: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_i32 s21, 1 +; GFX9-NEXT: s_cmp_lt_i32 s23, 1 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %if.then -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s8, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_mov_b32 s7, s20 +; GFX9-NEXT: s_mov_b32 s6, s19 +; GFX9-NEXT: s_mov_b32 s5, s18 +; GFX9-NEXT: s_mov_b32 s4, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX9-NEXT: .LBB1_2: ; %if.end ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -71,16 +71,16 @@ define void @uniform_br_unprofitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX10-LABEL: uniform_br_unprofitable: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_i32 s21, 1 +; GFX10-NEXT: s_cmp_lt_i32 s23, 1 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %if.then -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s19 -; GFX10-NEXT: s_mov_b32 s11, s18 -; GFX10-NEXT: s_mov_b32 s10, s17 -; GFX10-NEXT: s_mov_b32 s9, s16 -; GFX10-NEXT: s_mov_b32 s8, s7 -; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: v_mov_b32_e32 v1, s21 +; GFX10-NEXT: s_mov_b32 s7, s20 +; GFX10-NEXT: s_mov_b32 s6, s19 +; GFX10-NEXT: s_mov_b32 s5, s18 +; GFX10-NEXT: s_mov_b32 s4, s17 +; GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX10-NEXT: .LBB1_2: ; %if.end ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -101,16 +101,16 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no ; GFX9-LABEL: uniform_br_profitable: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_i32 s21, 1 +; GFX9-NEXT: s_cmp_lt_i32 s23, 1 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %if.then -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s8, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_mov_b32 s7, s20 +; GFX9-NEXT: s_mov_b32 s6, s19 +; GFX9-NEXT: s_mov_b32 s5, s18 +; GFX9-NEXT: s_mov_b32 s4, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX9-NEXT: .LBB2_2: ; %if.end ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -118,16 +118,16 @@ define void @uniform_br_profitable(i32 noundef inreg %value, ptr addrspace(8) no ; GFX10-LABEL: uniform_br_profitable: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_i32 s21, 1 +; GFX10-NEXT: s_cmp_lt_i32 s23, 1 ; GFX10-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %if.then -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s19 -; GFX10-NEXT: s_mov_b32 s11, s18 -; GFX10-NEXT: s_mov_b32 s10, s17 -; GFX10-NEXT: s_mov_b32 s9, s16 -; GFX10-NEXT: s_mov_b32 s8, s7 -; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v0, s16 +; GFX10-NEXT: v_mov_b32_e32 v1, s21 +; GFX10-NEXT: s_mov_b32 s7, s20 +; GFX10-NEXT: s_mov_b32 s6, s19 +; GFX10-NEXT: s_mov_b32 s5, s18 +; GFX10-NEXT: s_mov_b32 s4, s17 +; GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX10-NEXT: .LBB2_2: ; %if.end ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -149,18 +149,18 @@ define void @divergent_br_no_metadata(i32 noundef inreg %value, ptr addrspace(8) ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; %if.then -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s8, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_mov_b32 s7, s20 +; GFX9-NEXT: s_mov_b32 s6, s19 +; GFX9-NEXT: s_mov_b32 s5, s18 +; GFX9-NEXT: s_mov_b32 s4, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX9-NEXT: .LBB3_2: ; %if.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -168,38 +168,38 @@ define void @divergent_br_no_metadata(i32 noundef inreg %value, ptr addrspace(8) ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1010-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1010-NEXT: s_cbranch_execz .LBB3_2 ; GFX1010-NEXT: ; %bb.1: ; %if.then -; GFX1010-NEXT: v_mov_b32_e32 v0, s6 -; GFX1010-NEXT: v_mov_b32_e32 v1, s19 -; GFX1010-NEXT: s_mov_b32 s11, s18 -; GFX1010-NEXT: s_mov_b32 s10, s17 -; GFX1010-NEXT: s_mov_b32 s9, s16 -; GFX1010-NEXT: s_mov_b32 s8, s7 -; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1010-NEXT: v_mov_b32_e32 v0, s16 +; GFX1010-NEXT: v_mov_b32_e32 v1, s21 +; GFX1010-NEXT: s_mov_b32 s7, s20 +; GFX1010-NEXT: s_mov_b32 s6, s19 +; GFX1010-NEXT: s_mov_b32 s5, s18 +; GFX1010-NEXT: s_mov_b32 s4, s17 +; GFX1010-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX1010-NEXT: .LBB3_2: ; %if.end ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: divergent_br_no_metadata: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: s_mov_b32 s4, exec_lo +; GFX1030-NEXT: s_mov_b32 s8, exec_lo ; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0 ; GFX1030-NEXT: s_cbranch_execz .LBB3_2 ; GFX1030-NEXT: ; %bb.1: ; %if.then -; GFX1030-NEXT: v_mov_b32_e32 v0, s6 -; GFX1030-NEXT: v_mov_b32_e32 v1, s19 -; GFX1030-NEXT: s_mov_b32 s11, s18 -; GFX1030-NEXT: s_mov_b32 s10, s17 -; GFX1030-NEXT: s_mov_b32 s9, s16 -; GFX1030-NEXT: s_mov_b32 s8, s7 -; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1030-NEXT: v_mov_b32_e32 v0, s16 +; GFX1030-NEXT: v_mov_b32_e32 v1, s21 +; GFX1030-NEXT: s_mov_b32 s7, s20 +; GFX1030-NEXT: s_mov_b32 s6, s19 +; GFX1030-NEXT: s_mov_b32 s5, s18 +; GFX1030-NEXT: s_mov_b32 s4, s17 +; GFX1030-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX1030-NEXT: .LBB3_2: ; %if.end -; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: @@ -220,18 +220,18 @@ define void @divergent_br_unprofitable(i32 noundef inreg %value, ptr addrspace(8 ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %if.then -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s8, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_mov_b32 s7, s20 +; GFX9-NEXT: s_mov_b32 s6, s19 +; GFX9-NEXT: s_mov_b32 s5, s18 +; GFX9-NEXT: s_mov_b32 s4, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX9-NEXT: .LBB4_2: ; %if.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -239,38 +239,38 @@ define void @divergent_br_unprofitable(i32 noundef inreg %value, ptr addrspace(8 ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1010-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1010-NEXT: s_cbranch_execz .LBB4_2 ; GFX1010-NEXT: ; %bb.1: ; %if.then -; GFX1010-NEXT: v_mov_b32_e32 v0, s6 -; GFX1010-NEXT: v_mov_b32_e32 v1, s19 -; GFX1010-NEXT: s_mov_b32 s11, s18 -; GFX1010-NEXT: s_mov_b32 s10, s17 -; GFX1010-NEXT: s_mov_b32 s9, s16 -; GFX1010-NEXT: s_mov_b32 s8, s7 -; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1010-NEXT: v_mov_b32_e32 v0, s16 +; GFX1010-NEXT: v_mov_b32_e32 v1, s21 +; GFX1010-NEXT: s_mov_b32 s7, s20 +; GFX1010-NEXT: s_mov_b32 s6, s19 +; GFX1010-NEXT: s_mov_b32 s5, s18 +; GFX1010-NEXT: s_mov_b32 s4, s17 +; GFX1010-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX1010-NEXT: .LBB4_2: ; %if.end ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: divergent_br_unprofitable: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: s_mov_b32 s4, exec_lo +; GFX1030-NEXT: s_mov_b32 s8, exec_lo ; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0 ; GFX1030-NEXT: s_cbranch_execz .LBB4_2 ; GFX1030-NEXT: ; %bb.1: ; %if.then -; GFX1030-NEXT: v_mov_b32_e32 v0, s6 -; GFX1030-NEXT: v_mov_b32_e32 v1, s19 -; GFX1030-NEXT: s_mov_b32 s11, s18 -; GFX1030-NEXT: s_mov_b32 s10, s17 -; GFX1030-NEXT: s_mov_b32 s9, s16 -; GFX1030-NEXT: s_mov_b32 s8, s7 -; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1030-NEXT: v_mov_b32_e32 v0, s16 +; GFX1030-NEXT: v_mov_b32_e32 v1, s21 +; GFX1030-NEXT: s_mov_b32 s7, s20 +; GFX1030-NEXT: s_mov_b32 s6, s19 +; GFX1030-NEXT: s_mov_b32 s5, s18 +; GFX1030-NEXT: s_mov_b32 s4, s17 +; GFX1030-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX1030-NEXT: .LBB4_2: ; %if.end -; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: @@ -291,17 +291,17 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: ; %bb.1: ; %if.then -; GFX9-NEXT: s_mov_b32 s11, s18 -; GFX9-NEXT: s_mov_b32 s10, s17 -; GFX9-NEXT: s_mov_b32 s9, s16 -; GFX9-NEXT: s_mov_b32 s8, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_mov_b32 s7, s20 +; GFX9-NEXT: s_mov_b32 s6, s19 +; GFX9-NEXT: s_mov_b32 s5, s18 +; GFX9-NEXT: s_mov_b32 s4, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX9-NEXT: ; %bb.2: ; %if.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -309,36 +309,36 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1010-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1010-NEXT: ; %bb.1: ; %if.then -; GFX1010-NEXT: v_mov_b32_e32 v0, s6 -; GFX1010-NEXT: v_mov_b32_e32 v1, s19 -; GFX1010-NEXT: s_mov_b32 s11, s18 -; GFX1010-NEXT: s_mov_b32 s10, s17 -; GFX1010-NEXT: s_mov_b32 s9, s16 -; GFX1010-NEXT: s_mov_b32 s8, s7 -; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1010-NEXT: v_mov_b32_e32 v0, s16 +; GFX1010-NEXT: v_mov_b32_e32 v1, s21 +; GFX1010-NEXT: s_mov_b32 s7, s20 +; GFX1010-NEXT: s_mov_b32 s6, s19 +; GFX1010-NEXT: s_mov_b32 s5, s18 +; GFX1010-NEXT: s_mov_b32 s4, s17 +; GFX1010-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX1010-NEXT: ; %bb.2: ; %if.end ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: divergent_br_profitable: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: s_mov_b32 s4, exec_lo +; GFX1030-NEXT: s_mov_b32 s8, exec_lo ; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0 ; GFX1030-NEXT: ; %bb.1: ; %if.then -; GFX1030-NEXT: v_mov_b32_e32 v0, s6 -; GFX1030-NEXT: v_mov_b32_e32 v1, s19 -; GFX1030-NEXT: s_mov_b32 s11, s18 -; GFX1030-NEXT: s_mov_b32 s10, s17 -; GFX1030-NEXT: s_mov_b32 s9, s16 -; GFX1030-NEXT: s_mov_b32 s8, s7 -; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX1030-NEXT: v_mov_b32_e32 v0, s16 +; GFX1030-NEXT: v_mov_b32_e32 v1, s21 +; GFX1030-NEXT: s_mov_b32 s7, s20 +; GFX1030-NEXT: s_mov_b32 s6, s19 +; GFX1030-NEXT: s_mov_b32 s5, s18 +; GFX1030-NEXT: s_mov_b32 s4, s17 +; GFX1030-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; GFX1030-NEXT: ; %bb.2: ; %if.end -; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll index 52e76dd24a20b4..ce2b84ebdacb12 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -6,7 +6,7 @@ define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(ptr addrspace(1) ; GCN-LABEL: test_mul24_knownbits_kernel: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: v_mul_i32_i24_e32 v0, -5, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffffe0, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 0025d23b108038..9d0d85da9f7fa6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -146,14 +146,14 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_writelane_b32 v43, s45, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 ; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 @@ -163,9 +163,9 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s12, s45 ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 @@ -285,15 +285,15 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v43, s45, 13 ; CHECK-NEXT: v_mov_b32_e32 v42, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: v_mov_b32_e32 v41, v3 ; CHECK-NEXT: v_mov_b32_e32 v40, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 ; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mul_f64 v[0:1], v[40:41], v[0:1] @@ -302,9 +302,9 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s12, s45 ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 @@ -430,14 +430,14 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_writelane_b32 v43, s45, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 ; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 @@ -447,9 +447,9 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s12, s45 ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 @@ -571,13 +571,13 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v42, s45, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 ; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: v_lshlrev_b32_e32 v41, 1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -588,9 +588,9 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s12, s45 ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 @@ -715,13 +715,13 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_writelane_b32 v43, s45, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 ; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: v_or_b32_e32 v42, 1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -732,9 +732,9 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s12, s45 ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll index 8cda553e61c8ad..ad646a36a74430 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll @@ -9,8 +9,8 @@ ; Legacy intrinsics that just read implicit parameters ; FUNC-LABEL: {{^}}ngroups_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x0 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x0 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -24,8 +24,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x4 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -39,8 +39,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x2 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x8 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -54,8 +54,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x3 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0xc +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0xc ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -69,8 +69,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x10 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x10 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -84,8 +84,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x5 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x14 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x14 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -99,8 +99,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x6 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x18 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x18 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -114,8 +114,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x7 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1c +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x1c ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -129,8 +129,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x20 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[4:5], 0x20 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index 122fc42ef9b62a..96d7f02cf2422a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -6,13 +6,13 @@ ; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................ ; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................ -; OBJDUMP-NEXT: 0030 4000af00 94130000 1a000400 00000000 @............... +; OBJDUMP-NEXT: 0030 8000af00 98130000 1a000400 00000000 ................ ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6: -; ASM: .amdhsa_user_sgpr_count 10 -; ASM: .amdhsa_next_free_sgpr 10 -; ASM: ; TotalNumSgprs: 16 -; ASM: ; NumSGPRsForWavesPerEU: 16 +; ASM: .amdhsa_user_sgpr_count 12 +; ASM: .amdhsa_next_free_sgpr 12 +; ASM: ; TotalNumSgprs: 18 +; ASM: ; NumSGPRsForWavesPerEU: 18 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT ; feild that are not explicitly referenced in the kernel. This test has 6 implicit diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.o b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.o new file mode 100644 index 0000000000000000000000000000000000000000..9805d7434b77ba8ff08719a50f0dda1697d61736 GIT binary patch literal 11280 zcmeHN-;3Nt9G|Kl@sXj%}R-sA}3ck(FW@mT9W;bh+-QHEO zCoMh+mC}bki6FH?wLh$u77G=@Hw%4ip+fOLP<+?9MtGW&8jAg6xN^biqo9Xj%kR{_t}a4$ zl%XmdHKC%uZ~CdIi-Bx(1XFtbJs+Z_mdWnLtiUP>zfuhN^ z=15&{S#zYe!zH;~PPIjT4c7d2oDKH;fF7D*oOmb*V6|7T<0#`aOE@<~5A9x%_zl`k zyx8#|xi2zWw}f_4(3%)^VSi7CT`0k(cSa4@5$Oef(8E2PF@9YtpE&WQz-|)VaKyyNjW+HiuLM$q8gloKFP!P`~>d%(dkBh zA0;^JR}po=i8FWFe1#vxU7rjkH>~qcyx6XW!M)uUJvpt~Ri_u@7}gevJl}(Hg1pwx zWb$X7fGhk!?c|bk6!mdE3dsedxw~bi`FS=HBzcOOR`02zWlnR{GAvVg!BB?0>t3|% zL*8?*aE85QI(7u9Y1WP)CG9OC46@8QE4&o6&k8T~*jt(WwG&omVY5=#`&^d=M{1id z$tCpw=4I3}AEF0LufhZNOApvC2d!Ln`{`fn8W;+WM<7Zj9s2IOvz`wVio+#oc4E=6lFQAL~Lo9=0d=q&OUDp2F zG+ff}F9!YNhWK%V{?`op4;l1-WYB-Wpuca>|EEFcoFV>=L4Vhv|D-|xWrO|?4f?+{ z=zrGGPTipYo9&>PnO*==`~=C1w!b9WP4TBvJj>%It_^=Y+rNlo| z;?FDbJ4*ZoB@UbfFC@U>HC>06E=VE9Z}-|U;GQ$E-r{t6ZAc3qCb0InGz_e*tA>F! z10DueYeZCY6-TsGwG83I8F}@?8Tk~3GxB*rT5q?$_JhEA^#fbZ3m*v9G&fNUBkc9q z@0OKtD~6`tilL=o#VF)K3V*DGMVkIW3zt>w!q zovb_?dl5>!rd77qEG<`SmX@hCOUp|s7xT)s!SpmAvn6R+*^;!R3M4jmrfaWd)~cCc zS}taorrr$G^qOIsHa1p$&$CiE46NxK2G-POU?)zZ1XTfj0l>E8)dF&&u!a(3(x@h6 z!l+l;gi+<2Fsf?tL%zz*da@3Ys~_3e)K0#Cm7G^UV%e01=rR3@WlR=N|Es9`m<+MK zTmw&$_Gw72%sAeCT<~&uSA^5eR>G%ir+obS4vv`T8}{~`=9#`lpAvDJUakvv#@#3Y zxlJDX7ilyfqZC{IWwM~78VdP~{=Xpa+SO0GPE2D@5OSVN&pmjWBkqd9r5bmTZ$AHj E01Umu)Bpeg literal 0 HcmV?d00001 diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll index 74a7ca243b56ad..f52ba7000edeb9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll @@ -23,7 +23,7 @@ ; ELF: Section: .text (0x2) ; ELF: } -; GFX10: NumSGPRsForWavesPerEU: 4 +; GFX10: NumSGPRsForWavesPerEU: 6 ; GFX10: NumVGPRsForWavesPerEU: 1 define amdgpu_kernel void @simple(ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll index 0fa58f3c444a54..13daedf9872297 100644 --- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll +++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_clear_msb: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -21,12 +21,12 @@ define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_set_msb: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s6, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -38,12 +38,12 @@ define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_clear_lsb(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_clear_lsb: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, -2 +; SI-NEXT: s_and_b32 s4, s6, -2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -55,12 +55,12 @@ define amdgpu_kernel void @s_clear_lsb(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_set_lsb(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_set_lsb: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_b32 s4, s4, 1 +; SI-NEXT: s_or_b32 s4, s6, 1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -72,12 +72,12 @@ define amdgpu_kernel void @s_set_lsb(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_clear_midbit: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 8 +; SI-NEXT: s_and_b32 s4, s6, 0xfffffeff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -89,12 +89,12 @@ define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_set_midbit: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 8 +; SI-NEXT: s_or_b32 s4, s6, 0x100 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll index 4b56b5e9d24f5c..e68a2cdc0b8461 100644 --- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll +++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_or_to_orn2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: s_or_b32 s4, s6, 0xffffffcd ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -21,12 +21,12 @@ define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_or_to_orn2_imm0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: s_or_b32 s4, s6, 0xffffffcd ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -38,12 +38,12 @@ define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_and_to_andn2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: s_and_b32 s4, s6, 0xffffffcd ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -55,12 +55,12 @@ define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_and_to_andn2_imm0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: s_and_b32 s4, s6, 0xffffffcd ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -72,12 +72,12 @@ define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_xor_to_xnor: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: s_xor_b32 s4, s6, 0xffffffcd ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -89,12 +89,12 @@ define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_xor_to_xnor_imm0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: s_xor_b32 s4, s6, 0xffffffcd ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 8b6c8be9f37882..6e16c900332736 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -9,12 +9,12 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GCN-LABEL: anyext_i1_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -22,12 +22,12 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; ; GFX8-LABEL: anyext_i1_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX8-NEXT: v_not_b32_e32 v0, v0 @@ -37,12 +37,12 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; ; GFX9-LABEL: anyext_i1_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX9-NEXT: v_not_b32_e32 v0, v0 @@ -62,23 +62,23 @@ entry: define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; GCN-LABEL: s_anyext_i16_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN-NEXT: s_mov_b64 s[12:13], s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v1 -; GCN-NEXT: s_mov_b64 s[2:3], s[14:15] +; GCN-NEXT: s_mov_b64 s[6:7], s[14:15] ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[12:15], 0 addr64 -; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s9, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_not_b32_e32 v0, v0 @@ -88,46 +88,46 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: s_anyext_i16_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u16_e32 v0, v2, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_anyext_i16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[0:1] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, v2, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index ad0babd74f9c5e..a9e092fa39fbe7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -19,23 +19,23 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, i define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX6-NEXT: s_mul_i32 s4, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_mul_i32 s2, s2, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -47,23 +47,23 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -75,23 +75,23 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -102,26 +102,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -138,18 +137,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -159,25 +157,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -197,16 +195,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -218,27 +216,27 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -258,7 +256,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -266,11 +264,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -288,24 +286,24 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -318,24 +316,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -348,24 +346,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -377,27 +375,26 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -407,55 +404,55 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 -; GFX10W32-NEXT: s_mov_b32 s4, exec_lo +; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -467,25 +464,25 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) @@ -497,28 +494,28 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -530,31 +527,30 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX12W32-NEXT: s_mov_b32 s4, exec_lo +; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -570,19 +566,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_mov_b32 m0, s5 -; GFX6-NEXT: v_readlane_b32 s8, v0, s5 -; GFX6-NEXT: v_writelane_b32 v1, s4, m0 -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s3 +; GFX6-NEXT: v_readlane_b32 s8, v0, s3 +; GFX6-NEXT: v_writelane_b32 v1, s2, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -593,13 +589,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB2_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -613,16 +609,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 @@ -635,13 +631,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -654,16 +650,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 @@ -676,13 +672,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -694,16 +690,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec -; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 ; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -715,16 +711,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -739,12 +734,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 -; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 ; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -755,16 +750,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -776,18 +770,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -801,13 +795,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -825,14 +819,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -844,13 +838,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -863,18 +857,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: s_mov_b32 s4, 0 +; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -888,17 +882,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -914,14 +909,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -935,14 +930,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -961,19 +956,19 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX6-LABEL: struct_add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB3_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_mov_b32 m0, s5 -; GFX6-NEXT: v_readlane_b32 s8, v0, s5 -; GFX6-NEXT: v_writelane_b32 v1, s4, m0 -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s3 +; GFX6-NEXT: v_readlane_b32 s8, v0, s3 +; GFX6-NEXT: v_writelane_b32 v1, s2, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -984,15 +979,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB3_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_load_dword s3, s[4:5], 0x11 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB3_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1006,16 +1001,16 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1028,15 +1023,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB3_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_load_dword s3, s[4:5], 0x44 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB3_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -1049,16 +1044,16 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1071,15 +1066,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB3_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1091,16 +1086,16 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec -; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 ; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -1113,18 +1108,17 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: s_load_dword s3, s[4:5], 0x44 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX10W64-NEXT: v_mov_b32_e32 v2, s3 ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -1139,12 +1133,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 -; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 ; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -1156,18 +1150,17 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dword s8, s[2:3], 0x44 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dword s2, s[4:5], 0x44 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 -; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: v_mov_b32_e32 v2, s2 +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -1179,18 +1172,18 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -1205,15 +1198,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b32 s5, s[2:3], 0x44 -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b32 s3, s[4:5], 0x44 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX11W64-NEXT: v_mov_b32_e32 v2, s3 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB3_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1231,14 +1224,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -1251,14 +1244,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b32 s8, s[2:3], 0x44 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b32 s2, s[4:5], 0x44 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB3_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1271,18 +1264,18 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: s_mov_b32 s4, 0 +; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -1297,19 +1290,20 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44 -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b32 s3, s[4:5], 0x44 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 +; GFX12W64-NEXT: v_mov_b32_e32 v2, s3 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB3_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1325,14 +1319,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1347,15 +1341,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b32 s8, s[2:3], 0x44 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b32 s2, s[4:5], 0x44 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s2 +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB3_4: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1373,24 +1367,25 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_add v2, v0, s[0:3], 0 offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1399,80 +1394,78 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc +; GFX9-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: @@ -1485,23 +1478,23 @@ entry: define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX6-NEXT: s_mul_i32 s4, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_mul_i32 s2, s2, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1514,23 +1507,23 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1543,23 +1536,23 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1571,26 +1564,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1608,18 +1600,17 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1630,25 +1621,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1669,16 +1660,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1691,27 +1682,27 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1732,7 +1723,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -1740,11 +1731,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1763,24 +1754,24 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1793,24 +1784,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1823,24 +1814,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1852,25 +1843,25 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1882,55 +1873,55 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 -; GFX10W32-NEXT: s_mov_b32 s4, exec_lo +; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1943,25 +1934,25 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) @@ -1974,28 +1965,28 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -2008,27 +1999,27 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX12W32-NEXT: s_mov_b32 s4, exec_lo +; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -2048,19 +2039,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_mov_b32 m0, s5 -; GFX6-NEXT: v_readlane_b32 s8, v0, s5 -; GFX6-NEXT: v_writelane_b32 v1, s4, m0 -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s3 +; GFX6-NEXT: v_readlane_b32 s8, v0, s3 +; GFX6-NEXT: v_writelane_b32 v1, s2, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -2071,13 +2062,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB7_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB7_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2091,16 +2082,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2113,13 +2104,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -2132,16 +2123,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2154,13 +2145,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2172,16 +2163,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec -; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 ; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -2193,16 +2184,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2217,12 +2207,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 -; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 ; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -2233,16 +2223,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2254,18 +2243,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -2279,13 +2268,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -2303,14 +2292,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -2322,13 +2311,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -2342,18 +2331,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: s_mov_b32 s4, 0 +; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -2367,17 +2356,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -2393,14 +2383,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2414,14 +2404,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -2440,24 +2430,25 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_sub v2, v0, s[0:3], 0 offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2466,80 +2457,78 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc +; GFX9-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 98d2e71f3975c2..9577230c6c52e2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -26,7 +26,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -89,106 +89,106 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -223,16 +223,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mul_i32 s5, s5, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 @@ -257,7 +257,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -294,22 +294,19 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: s_mov_b32 s5, exec_lo +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB0_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -318,7 +315,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB0_2: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 @@ -337,321 +333,325 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) { ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s8, s[4:5], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s8, s6 -; GFX7LESS-NEXT: s_mov_b32 s9, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 -; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s6, s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s2 +; GFX7LESS-NEXT: s_mov_b32 s13, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mov_b32 s12, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_mov_b32 s13, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s10, s2 +; GFX1064-NEXT: s_mul_i32 s6, s8, s6 ; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: s_mov_b32 s12, s2 +; GFX1064-NEXT: s_mov_b32 s13, s3 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1] -; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s8, v0, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX1032-NEXT: s_mov_b32 s7, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s7, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s7 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_mul_i32 s5, s6, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s6, v0, s[2:3] +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_mul_i32 s6, s8, s6 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: s_mov_b32 s12, s2 +; GFX1164-NEXT: s_mov_b32 s13, s3 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB1_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1] -; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[2:3] +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mul_i32 s6, s4, s6 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 +; GFX1132-NEXT: v_mov_b32_e32 v1, s6 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB1_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] -; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3] +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i32_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1264-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX1264-NEXT: s_mov_b64 s[6:7], exec +; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz .LBB1_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: s_wait_alu 0xfffe +; GFX1264-NEXT: s_mul_i32 s6, s8, s6 +; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe +; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: s_mov_b32 s12, s2 +; GFX1264-NEXT: s_mov_b32 s13, s3 +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1] -; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[2:3] +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: add_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo -; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: s_mov_b32 s5, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_i32 s2, s0, s2 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: s_mul_i32 s6, s4, s6 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_mov_b32_e32 v1, s2 -; GFX1232-NEXT: s_mov_b32 s8, s6 -; GFX1232-NEXT: s_mov_b32 s9, s7 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: v_mov_b32_e32 v1, s6 +; GFX1232-NEXT: s_mov_b32 s8, s2 +; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB1_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3] -; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v0, s[2:3] +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel @@ -667,18 +667,18 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -714,17 +714,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -755,137 +755,137 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-LABEL: add_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s8, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s8, s8, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: s_mov_b32 s15, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s14, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s12, s6 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s13, s7 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 -; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB2_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s0, v1 -; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_add_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: add_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s8, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s8, s8, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s0, v1 -; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: add_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s0, v1 -; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: add_i32_varying: @@ -899,15 +899,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 @@ -943,31 +943,31 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -978,7 +978,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB2_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -999,15 +999,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 @@ -1043,34 +1043,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE: ; %bb.0: ; %entry ; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 @@ -1080,8 +1079,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB2_4: -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -1093,7 +1091,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: add_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 @@ -1111,7 +1109,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: add_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -1162,14 +1160,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: add_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 @@ -1182,33 +1180,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s6 -; GFX9_DPP-NEXT: s_mov_b32 s9, s7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX9_DPP-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB2_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 -; GFX9_DPP-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: add_i32_varying: @@ -1222,53 +1220,53 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064_DPP-NEXT: s_mov_b32 s0, s9 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i32_varying: @@ -1281,44 +1279,44 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i32_varying: @@ -1338,14 +1336,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -1405,7 +1403,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -1464,18 +1462,17 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1483,7 +1480,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -1534,7 +1530,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -1587,7 +1583,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1625,7 +1621,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1662,114 +1658,115 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB3_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1] -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1] -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1806,16 +1803,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mul_i32 s5, s5, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 @@ -1841,7 +1838,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1880,10 +1877,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: s_mov_b32 s4, exec_lo +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_mov_b32 s7, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1891,12 +1888,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -1905,6 +1900,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB3_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 @@ -1925,219 +1921,221 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_mov_b32 s12, s2 +; GFX7LESS-NEXT: s_mov_b32 s13, s3 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX7LESS-NEXT: s_mul_i32 s3, s5, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX7LESS-NEXT: s_mul_i32 s2, s4, s2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s5, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s4, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s4, v2 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s2 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s7, v2 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s1, s6 +; GFX8-NEXT: s_mov_b32 s12, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s4, v0, 0 +; GFX8-NEXT: s_mul_i32 s2, s5, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: s_mov_b32 s13, s3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1] -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_mul_lo_u32 v3, s5, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v2, v[0:1] +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mul_i32 s3, s7, s2 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, v[0:1] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v2, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 +; GFX1064-NEXT: s_mul_i32 s9, s7, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX1064-NEXT: s_mul_i32 s8, s6, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, s[2:3] -; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v2, s[2:3] +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2] +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 +; GFX1032-NEXT: s_mul_i32 s8, s7, s5 +; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s5 +; GFX1032-NEXT: s_mul_i32 s5, s6, s5 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v2, s[2:3] -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s1, v2, v[1:2] -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s6, v2, s[2:3] +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s7, v2, v[1:2] +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -2148,87 +2146,87 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s9, s1, s8 -; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1164-NEXT: s_mul_i32 s8, s0, s8 +; GFX1164-NEXT: s_mul_i32 s9, s5, s8 +; GFX1164-NEXT: s_mul_hi_u32 s10, s4, s8 +; GFX1164-NEXT: s_mul_i32 s8, s4, s8 ; GFX1164-NEXT: s_add_i32 s10, s10, s9 ; GFX1164-NEXT: v_mov_b32_e32 v0, s8 ; GFX1164-NEXT: v_mov_b32_e32 v1, s10 ; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3] -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1132-NEXT: s_mov_b32 s7, exec_lo +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s7, s7 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s8, s1, s3 -; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1132-NEXT: s_mul_i32 s3, s0, s3 +; GFX1132-NEXT: s_mul_i32 s8, s5, s7 +; GFX1132-NEXT: s_mul_hi_u32 s9, s4, s7 +; GFX1132-NEXT: s_mul_i32 s7, s4, s7 ; GFX1132-NEXT: s_add_i32 s9, s9, s8 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 +; GFX1132-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3] -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: add_i64_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -2238,38 +2236,38 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] +; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 ; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB4_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3] -; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2] -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2] +; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: add_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, exec_lo -; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: s_mov_b32 s7, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2277,29 +2275,30 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: s_mul_u64 s[6:7], s[4:5], s[6:7] ; GFX1232-NEXT: s_mov_b32 s14, -1 -; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1232-NEXT: s_mov_b32 s12, s6 -; GFX1232-NEXT: s_mov_b32 s13, s7 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX1232-NEXT: s_mov_b32 s12, s2 +; GFX1232-NEXT: s_mov_b32 s13, s3 ; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB4_2: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3] -; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2] -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2] +; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel @@ -2312,31 +2311,31 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 -; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 @@ -2344,13 +2343,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 ; GFX7LESS_ITERATIVE-NEXT: .LBB5_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -2367,44 +2366,44 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 -; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX8_ITERATIVE-NEXT: .LBB5_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -2418,160 +2417,160 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: add_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s7 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB5_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v4 -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 -; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s1, v1 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_add_co_u32_e32 v0, vcc, s5, v1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX9_ITERATIVE-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v2, vcc -; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: add_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 -; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 -; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB5_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s0, v1 -; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v2, vcc -; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: add_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB5_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s0, v1 -; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo -; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: @@ -2579,38 +2578,38 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 -; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2621,7 +2620,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB5_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2637,35 +2636,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 -; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2676,7 +2675,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB5_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2693,36 +2692,36 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] -; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 @@ -2732,7 +2731,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB5_4: -; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2748,34 +2747,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE: ; %bb.0: ; %entry ; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 @@ -2785,7 +2784,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB5_4: -; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -2799,7 +2798,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: add_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -2818,7 +2817,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -2901,16 +2900,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -2947,39 +2946,39 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s6 -; GFX9_DPP-NEXT: s_mov_b32 s9, s7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7 ; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB5_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v6 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v6, vcc, s1, v6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v6, vcc, s5, v6 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v7, vcc, v0, v7, vcc -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[4:7], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: add_i64_varying: @@ -3021,70 +3020,70 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s2, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v7, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s11, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s10, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s0 -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[8:9], off, s[0:3], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_add_x2 v[8:9], off, s[4:7], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB5_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v10 -; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1064_DPP-NEXT: v_add_co_u32 v8, vcc, s2, v10 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i64_varying: @@ -3125,51 +3124,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[0:3], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_add_x2 v[9:10], off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB5_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v11 -; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i64_varying: @@ -3212,17 +3211,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3316,7 +3315,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo @@ -3403,17 +3402,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3430,7 +3429,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v4, s8, 32 ; GFX1264_DPP-NEXT: v_writelane_b32 v5, s9, 32 -; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -3509,7 +3507,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo @@ -3568,7 +3566,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -3600,7 +3598,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -3633,109 +3631,109 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB6_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB6_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3771,16 +3769,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mul_i32 s5, s5, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 @@ -3806,7 +3804,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3844,22 +3842,19 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: s_mov_b32 s5, exec_lo +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB6_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_mov_b32_e32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -3868,7 +3863,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB6_2: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 @@ -3888,327 +3882,331 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) { ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s8, s[4:5], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s8, s6 -; GFX7LESS-NEXT: s_mov_b32 s9, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 -; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s6, s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s2 +; GFX7LESS-NEXT: s_mov_b32 s13, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mov_b32 s12, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_mov_b32 s13, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s10, s2 +; GFX1064-NEXT: s_mul_i32 s6, s8, s6 ; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b32 s12, s6 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: s_mov_b32 s12, s2 +; GFX1064-NEXT: s_mov_b32 s13, s3 ; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s10, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX1032-NEXT: s_mov_b32 s7, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s7, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s7 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_mul_i32 s5, s6, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s6, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_mul_i32 s6, s8, s6 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: s_mov_b32 s12, s2 +; GFX1164-NEXT: s_mov_b32 s13, s3 +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mul_i32 s6, s4, s6 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 +; GFX1132-NEXT: v_mov_b32_e32 v1, s6 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB7_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: sub_i32_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec -; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1264-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX1264-NEXT: s_mov_b64 s[6:7], exec +; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz .LBB7_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: s_wait_alu 0xfffe +; GFX1264-NEXT: s_mul_i32 s6, s8, s6 +; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: s_wait_alu 0xfffe +; GFX1264-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264-NEXT: s_mov_b32 s12, s2 +; GFX1264-NEXT: s_mov_b32 s13, s3 +; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB7_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 +; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: sub_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo -; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: s_mov_b32 s5, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_i32 s2, s0, s2 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: s_mul_i32 s6, s4, s6 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_mov_b32_e32 v1, s2 -; GFX1232-NEXT: s_mov_b32 s8, s6 -; GFX1232-NEXT: s_mov_b32 s9, s7 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: v_mov_b32_e32 v1, s6 +; GFX1232-NEXT: s_mov_b32 s8, s2 +; GFX1232-NEXT: s_mov_b32 s9, s3 ; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB7_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 +; GFX1232-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel @@ -4224,18 +4222,18 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB8_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4271,17 +4269,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4312,137 +4310,137 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-LABEL: sub_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s4 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s8, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s8, s8, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: s_mov_b32 s15, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s14, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s12, s6 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s13, s7 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 -; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 +; GFX9_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB8_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s0, v1 -; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX9_ITERATIVE-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: sub_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s8, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s8, s8, s7 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB8_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s0, v1 -; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: sub_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB8_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s0, v1 -; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: sub_i32_varying: @@ -4456,15 +4454,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 @@ -4500,31 +4498,31 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -4535,7 +4533,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB8_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4556,15 +4554,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 -; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 +; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 @@ -4600,34 +4598,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE: ; %bb.0: ; %entry ; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 +; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 @@ -4637,8 +4634,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB8_4: -; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 @@ -4650,7 +4646,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: sub_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 @@ -4668,7 +4664,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: sub_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -4719,14 +4715,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: sub_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 1 @@ -4739,33 +4735,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s6 -; GFX9_DPP-NEXT: s_mov_b32 s9, s7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s6 ; GFX9_DPP-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB8_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 -; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: sub_i32_varying: @@ -4779,53 +4775,53 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064_DPP-NEXT: s_mov_b32 s0, s9 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064_DPP-NEXT: s_mov_b32 s4, s9 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB8_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i32_varying: @@ -4838,44 +4834,44 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB8_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i32_varying: @@ -4895,14 +4891,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -4962,7 +4958,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -5021,18 +5017,17 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -5040,7 +5035,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -5091,7 +5085,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -5144,7 +5138,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -5182,7 +5176,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5220,122 +5214,122 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -5375,16 +5369,16 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB9_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_mul_i32 s5, s5, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 @@ -5413,7 +5407,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -5455,10 +5449,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232-NEXT: s_mov_b32 s4, exec_lo +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_mov_b32 s7, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5466,12 +5460,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 @@ -5480,6 +5472,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB9_2: +; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 @@ -5503,228 +5496,228 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_mov_b32 s12, s2 +; GFX7LESS-NEXT: s_mov_b32 s13, s3 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX7LESS-NEXT: s_mul_i32 s3, s5, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX7LESS-NEXT: s_mul_i32 s2, s4, s2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB10_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s5, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s4, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s4, v2 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s2 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s3, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s6 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s7, v2 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB10_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s1, s6 +; GFX8-NEXT: s_mov_b32 s12, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s4, v0, 0 +; GFX8-NEXT: s_mul_i32 s2, s5, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: s_mov_b32 s13, s3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB10_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v4, s5, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s4, v2, 0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s1, v2 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v2 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mul_i32 s3, s7, s2 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s7, v2, v[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 +; GFX1064-NEXT: s_mul_i32 s9, s7, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX1064-NEXT: s_mul_i32 s8, s6, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 ; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s6, v2, 0 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s7, v2, v[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 +; GFX1032-NEXT: s_mul_i32 s8, s7, s5 +; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s5 +; GFX1032-NEXT: s_mul_i32 s5, s6, s5 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s8, s6 -; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s0, v2, 0 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s1, v2, v[4:5] -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s6, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s7, v2, v[4:5] +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1164-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -5735,91 +5728,91 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s9, s1, s8 -; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1164-NEXT: s_mul_i32 s8, s0, s8 +; GFX1164-NEXT: s_mul_i32 s9, s5, s8 +; GFX1164-NEXT: s_mul_hi_u32 s10, s4, s8 +; GFX1164-NEXT: s_mul_i32 s8, s4, s8 ; GFX1164-NEXT: s_add_i32 s10, s10, s9 ; GFX1164-NEXT: v_mov_b32_e32 v0, s8 ; GFX1164-NEXT: v_mov_b32_e32 v1, s10 ; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB10_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 -; GFX1164-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, 0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5] -; GFX1164-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v5 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc -; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1132-NEXT: s_mov_b32 s7, exec_lo +; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s7, s7 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s8, s1, s3 -; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1132-NEXT: s_mul_i32 s3, s0, s3 +; GFX1132-NEXT: s_mul_i32 s8, s5, s7 +; GFX1132-NEXT: s_mul_hi_u32 s9, s4, s7 +; GFX1132-NEXT: s_mul_i32 s7, s4, s7 ; GFX1132-NEXT: s_add_i32 s9, s9, s8 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 +; GFX1132-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: s_mov_b32 s8, s6 -; GFX1132-NEXT: s_mov_b32 s9, s7 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB10_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 -; GFX1132-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5] -; GFX1132-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s4, v2, 0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5] +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v5 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; ; GFX1264-LABEL: sub_i64_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1264-NEXT: s_mov_b64 s[2:3], exec +; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -5829,42 +5822,42 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; %bb.1: ; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11] +; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s8 ; GFX1264-NEXT: v_mov_b32_e32 v1, s9 ; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 ; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB10_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 -; GFX1264-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5] -; GFX1264-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 +; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] +; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3 +; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mov_b32_e32 v1, v4 -; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc -; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1264-NEXT: s_endpgm ; ; GFX1232-LABEL: sub_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, exec_lo -; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: s_mov_b32 s7, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5872,33 +5865,34 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: ; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1232-NEXT: s_bcnt1_i32_b32 s6, s6 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: s_mul_u64 s[6:7], s[4:5], s[6:7] ; GFX1232-NEXT: s_mov_b32 s14, -1 -; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1232-NEXT: s_mov_b32 s12, s6 -; GFX1232-NEXT: s_mov_b32 s13, s7 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX1232-NEXT: s_mov_b32 s12, s2 +; GFX1232-NEXT: s_mov_b32 s13, s3 ; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB10_2: -; GFX1232-NEXT: s_wait_alu 0xfffe ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: s_wait_kmcnt 0x0 -; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 -; GFX1232-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5] -; GFX1232-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] +; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 +; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-NEXT: v_mov_b32_e32 v1, v4 -; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1232-NEXT: s_endpgm entry: %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel @@ -5911,31 +5905,31 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 -; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 @@ -5943,13 +5937,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX7LESS_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: buffer_wbinvl1 ; GFX7LESS_ITERATIVE-NEXT: .LBB11_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -5966,44 +5960,44 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 -; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX8_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX8_ITERATIVE-NEXT: .LBB11_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -6017,160 +6011,160 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: sub_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s2 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s6, s6, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s7, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s7 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX9_ITERATIVE-NEXT: buffer_wbinvl1_vol ; GFX9_ITERATIVE-NEXT: .LBB11_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v4 -; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 -; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v1 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_ITERATIVE-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_ITERATIVE-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v2, vcc -; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9_ITERATIVE-NEXT: s_endpgm ; ; GFX1064_ITERATIVE-LABEL: sub_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 -; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 -; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB11_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 -; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s0, v1 -; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v2, vcc -; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1064_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064_ITERATIVE-NEXT: s_endpgm ; ; GFX1032_ITERATIVE-LABEL: sub_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s6 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s7 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s8, s2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_sub_x2 v[3:4], off, s[8:11], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB11_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s0, v3 -; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s6, -1 -; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v1 -; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v2, vcc_lo -; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1032_ITERATIVE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032_ITERATIVE-NEXT: s_endpgm ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: @@ -6178,38 +6172,38 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 -; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6220,7 +6214,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB11_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6236,35 +6230,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 -; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 +; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6275,7 +6269,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB11_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6292,36 +6286,36 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10 ; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] -; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s6 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s7 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 @@ -6331,7 +6325,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB11_4: -; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6347,34 +6341,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE: ; %bb.0: ; %entry ; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo -; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 -; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] +; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3] ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo -; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 @@ -6384,7 +6378,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB11_4: -; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 @@ -6398,7 +6392,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: sub_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -6417,7 +6411,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -6500,16 +6494,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -6546,39 +6540,39 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s8, s6 -; GFX9_DPP-NEXT: s_mov_b32 s9, s7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 +; GFX9_DPP-NEXT: s_mov_b32 s8, s2 +; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7 ; GFX9_DPP-NEXT: buffer_atomic_sub_x2 v[6:7], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9_DPP-NEXT: buffer_wbinvl1_vol ; GFX9_DPP-NEXT: .LBB11_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v6 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX9_DPP-NEXT: v_sub_co_u32_e32 v6, vcc, s1, v6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_sub_co_u32_e32 v6, vcc, s5, v6 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_subb_co_u32_e32 v7, vcc, v0, v7, vcc -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[4:7], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: sub_i64_varying: @@ -6620,70 +6614,70 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s10, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s2, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s3, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v7, s6, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s7, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s11, v2, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s11, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s10, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s0 -; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s4 +; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[8:9], off, s[0:3], 0 glc +; GFX1064_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1064_DPP-NEXT: buffer_atomic_sub_x2 v[8:9], off, s[4:7], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl1_inv ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB11_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s2, v8 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v10 -; GFX1064_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 +; GFX1064_DPP-NEXT: v_sub_co_u32 v8, vcc, s2, v10 +; GFX1064_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i64_varying: @@ -6724,51 +6718,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s8, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: v_readlane_b32 s7, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s0, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s1, s7 -; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[0:3], 0 glc +; GFX1032_DPP-NEXT: s_mov_b32 s4, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 +; GFX1032_DPP-NEXT: buffer_atomic_sub_x2 v[9:10], off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl1_inv ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB11_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s2, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v11 -; GFX1032_DPP-NEXT: s_mov_b32 s6, s2 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s2, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s2, s6 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s3, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i64_varying: @@ -6811,17 +6805,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6915,7 +6909,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo @@ -7002,17 +6996,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7029,7 +7023,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v4, s8, 32 ; GFX1264_DPP-NEXT: v_writelane_b32 v5, s9, 32 -; GFX1264_DPP-NEXT: s_wait_alu 0xfffe ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -7108,7 +7101,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index f7773b4859dc2c..9c2527ae4781bd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -24,24 +24,24 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -52,24 +52,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -80,23 +80,23 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -107,26 +107,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -154,8 +153,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -166,26 +164,26 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_mul_i32 s2, s2, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -214,7 +212,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -232,26 +230,26 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) { ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s4, s6, s4 +; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB1_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -263,26 +261,26 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -294,25 +292,25 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -324,28 +322,27 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s4, s6, s4 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -356,59 +353,58 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s0, s4 -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s4, s6, s4 +; GFX1164-NEXT: s_mul_i32 s2, s6, s2 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB1_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -420,26 +416,26 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s0, s4 +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB1_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s6, -1 @@ -458,19 +454,19 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -482,13 +478,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -500,16 +496,16 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: add_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -523,13 +519,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB2_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -541,16 +537,16 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: add_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 @@ -564,12 +560,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB2_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -581,16 +577,16 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: add_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -603,15 +599,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -627,12 +622,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -651,8 +646,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -665,18 +659,18 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -691,13 +685,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB2_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -715,14 +709,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -740,7 +734,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB2_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -752,7 +746,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: add_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -783,7 +777,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -792,13 +786,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB2_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -828,7 +822,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -837,12 +831,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB2_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -863,45 +857,46 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i32_varying: @@ -916,7 +911,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -924,26 +919,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i32_varying: @@ -963,48 +959,49 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB2_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i32_varying: @@ -1025,7 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -1033,26 +1030,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB2_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -1473,24 +1471,24 @@ entry: define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1506,26 +1504,26 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1538,25 +1536,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1569,26 +1567,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] @@ -1617,8 +1614,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] @@ -1630,26 +1626,26 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_mul_i32 s2, s2, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1680,7 +1676,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1700,7 +1696,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1741,7 +1737,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1779,117 +1775,116 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s7, s2 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_i32 s2, s6, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1] -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2] -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_nop 2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s3, s7, s2 -; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2 -; GFX1064-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064-NEXT: s_add_i32 s8, s8, s3 -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: s_mul_i32 s7, s3, s6 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX1064-NEXT: s_mul_i32 s6, s2, s6 +; GFX1064-NEXT: s_add_i32 s8, s8, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: v_mov_b32_e32 v1, s8 ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1] -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2] -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s7, s1 -; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1 -; GFX1032-NEXT: s_mul_i32 s1, s6, s1 -; GFX1032-NEXT: s_add_i32 s3, s3, s2 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_mul_i32 s6, s3, s5 +; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 +; GFX1032-NEXT: s_mul_i32 s5, s2, s5 +; GFX1032-NEXT: s_add_i32 s7, s7, s6 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1] -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2] -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1928,16 +1923,16 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 @@ -1972,13 +1967,13 @@ entry: define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: add_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 @@ -1987,8 +1982,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 ; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB6_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -1996,8 +1991,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -2007,8 +2002,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB6_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -2022,13 +2017,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: add_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -2037,16 +2032,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -2056,8 +2051,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB6_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -2071,13 +2066,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: add_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -2086,16 +2081,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -2104,8 +2099,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB6_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -2120,12 +2115,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: add_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 @@ -2133,16 +2128,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -2153,9 +2148,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB6_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 @@ -2169,28 +2163,28 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: add_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -2201,9 +2195,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB6_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 @@ -2218,12 +2211,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 @@ -2233,9 +2226,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -2243,9 +2236,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -2255,8 +2248,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB6_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2271,32 +2264,32 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: add_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -2305,8 +2298,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB6_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2320,7 +2313,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: add_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2377,8 +2370,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -2387,14 +2380,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_add_rtn_u64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB6_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -2454,8 +2447,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -2464,13 +2457,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX9_DPP-NEXT: ds_add_rtn_u64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB6_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -2524,11 +2517,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 @@ -2538,30 +2531,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s4, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v8, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v7, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v8, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v8, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 @@ -2571,25 +2564,26 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB6_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v11 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s3, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s4, v12, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -2623,23 +2617,23 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 @@ -2649,17 +2643,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB6_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v11 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s3, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i64_varying: @@ -2703,30 +2698,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -2734,14 +2729,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -2750,28 +2745,28 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB6_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s0, v9 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s1, v10, vcc +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s3, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s4, v10, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -2800,27 +2795,27 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 15 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 @@ -2828,18 +2823,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB6_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v10 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -3440,24 +3435,24 @@ entry: define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB8_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -3469,24 +3464,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -3498,23 +3493,23 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -3526,26 +3521,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -3574,8 +3568,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -3587,26 +3580,26 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_mul_i32 s2, s2, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -3636,7 +3629,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -3655,26 +3648,26 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) { ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s4, s6, s4 +; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -3686,26 +3679,26 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -3717,25 +3710,25 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -3747,27 +3740,27 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s4, s6, s4 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -3779,59 +3772,59 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s0, s4 -; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s4, s6, s4 +; GFX1164-NEXT: s_mul_i32 s2, s6, s2 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 @@ -3844,26 +3837,26 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s0, s4 +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 @@ -3883,19 +3876,19 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB10_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -3907,13 +3900,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB10_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -3925,16 +3918,16 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: sub_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 @@ -3948,13 +3941,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB10_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -3966,16 +3959,16 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: sub_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 @@ -3989,12 +3982,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB10_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -4006,16 +3999,16 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: sub_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -4028,15 +4021,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB10_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -4052,12 +4044,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -4076,8 +4068,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB10_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -4090,18 +4081,18 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -4116,13 +4107,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB10_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4140,14 +4131,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -4165,7 +4156,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB10_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4177,7 +4168,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: sub_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4208,7 +4199,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -4217,13 +4208,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB10_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -4253,7 +4244,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -4262,12 +4253,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB10_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -4288,45 +4279,46 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB10_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i32_varying: @@ -4341,7 +4333,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4349,26 +4341,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB10_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i32_varying: @@ -4388,48 +4381,49 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB10_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i32_varying: @@ -4450,7 +4444,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4458,26 +4452,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB10_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -4898,24 +4893,24 @@ entry: define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -4931,24 +4926,24 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB12_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -4964,23 +4959,23 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -4996,26 +4991,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -5047,8 +5041,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -5063,26 +5056,26 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: s_mul_i32 s2, s2, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB12_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5116,7 +5109,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB12_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -5139,7 +5132,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -5180,7 +5173,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5219,124 +5212,124 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s7, s2 -; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_i32 s2, s6, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB13_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s5, v3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s1, v3 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s3, s7, s2 -; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2 -; GFX1064-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064-NEXT: s_add_i32 s8, s8, s3 -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: s_mul_i32 s7, s3, s6 +; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX1064-NEXT: s_mul_i32 s6, s2, s6 +; GFX1064-NEXT: s_add_i32 s8, s8, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: v_mov_b32_e32 v1, s8 ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB13_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s6, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s7, s1 -; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1 -; GFX1032-NEXT: s_mul_i32 s1, s6, s1 -; GFX1032-NEXT: s_add_i32 s3, s3, s2 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: s_mul_i32 s6, s3, s5 +; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 +; GFX1032-NEXT: s_mul_i32 s5, s2, s5 +; GFX1032-NEXT: s_add_i32 s7, s7, s6 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB13_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s6, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5] -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -5377,16 +5370,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 @@ -5423,13 +5416,13 @@ entry: define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 @@ -5438,8 +5431,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 ; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB14_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -5447,8 +5440,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -5458,8 +5451,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB14_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -5473,13 +5466,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: sub_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -5488,16 +5481,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -5507,8 +5500,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB14_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -5522,13 +5515,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: sub_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -5537,16 +5530,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -5555,8 +5548,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB14_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -5571,12 +5564,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: sub_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 @@ -5584,16 +5577,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -5604,9 +5597,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB14_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 @@ -5620,28 +5612,28 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: sub_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -5652,9 +5644,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB14_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 @@ -5669,12 +5660,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 @@ -5684,9 +5675,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -5694,9 +5685,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -5706,8 +5697,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB14_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5722,32 +5713,32 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: sub_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -5756,8 +5747,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB14_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5771,7 +5762,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: sub_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5828,8 +5819,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5838,14 +5829,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB14_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -5905,8 +5896,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -5915,13 +5906,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB14_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -5975,11 +5966,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc, v2, v5, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v1, vcc, v1, v3 @@ -5989,30 +5980,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v8, s4, 16 -; GFX1064_DPP-NEXT: v_writelane_b32 v7, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v8, s2, 16 +; GFX1064_DPP-NEXT: v_writelane_b32 v7, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v8, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v8, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v7, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 @@ -6022,25 +6013,26 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB14_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v11 -; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s3, v11 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s4, v12, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -6074,23 +6066,23 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v7, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 @@ -6100,17 +6092,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB14_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v11 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s3, v11 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s4, v12, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i64_varying: @@ -6154,30 +6147,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -6185,14 +6178,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -6201,28 +6194,28 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB14_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s0, v9 -; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s1, v10, vcc +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s3, v9 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s4, v10, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -6251,27 +6244,27 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 15 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 @@ -6279,18 +6272,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB14_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v10 -; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -6304,19 +6297,19 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: and_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB15_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -6328,13 +6321,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB15_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -6346,16 +6339,16 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: and_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 @@ -6369,13 +6362,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB15_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -6387,16 +6380,16 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: and_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 @@ -6410,12 +6403,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB15_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -6427,16 +6420,16 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: and_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -6449,15 +6442,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB15_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 @@ -6473,12 +6465,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -6497,8 +6489,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB15_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 @@ -6511,18 +6502,18 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -6537,13 +6528,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB15_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6561,14 +6552,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -6586,7 +6577,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB15_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6598,7 +6589,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: and_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6628,7 +6619,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -6638,13 +6629,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB15_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -6673,7 +6664,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -6683,12 +6674,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB15_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -6709,45 +6700,46 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB15_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: and_i32_varying: @@ -6762,34 +6754,35 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1032_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB15_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: and_i32_varying: @@ -6809,48 +6802,49 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB15_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i32_varying: @@ -6871,34 +6865,35 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1132_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB15_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -6910,21 +6905,21 @@ entry: define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: and_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB16_1 @@ -6933,8 +6928,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -6944,8 +6939,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB16_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -6958,13 +6953,13 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: and_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 @@ -6972,16 +6967,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -6991,8 +6986,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB16_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -7005,13 +7000,13 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: and_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 @@ -7019,16 +7014,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -7037,8 +7032,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB16_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -7052,28 +7047,28 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: and_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -7084,9 +7079,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB16_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 @@ -7100,27 +7094,27 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: and_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -7131,9 +7125,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB16_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 @@ -7148,21 +7141,21 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -7170,9 +7163,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -7182,8 +7175,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB16_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -7198,29 +7191,29 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: and_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -7229,8 +7222,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB16_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -7244,7 +7237,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: and_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7281,8 +7274,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -7291,14 +7284,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB16_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 @@ -7337,8 +7330,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -7347,13 +7340,13 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB16_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 @@ -7385,26 +7378,26 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 @@ -7412,14 +7405,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -7429,24 +7422,25 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB16_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s1, v8 +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s3, v9 +; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: and_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7461,23 +7455,23 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -7487,17 +7481,18 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB16_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s1, v8 +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s3, v9 +; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: and_i64_varying: @@ -7528,29 +7523,29 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 @@ -7559,14 +7554,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -7575,27 +7570,27 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB16_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s1, v8 +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s3, v9 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7615,25 +7610,25 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 @@ -7641,18 +7636,18 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB16_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s1, v8 +; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s3, v9 +; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -7666,19 +7661,19 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: or_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB17_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7690,13 +7685,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB17_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -7708,16 +7703,16 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: or_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 @@ -7731,13 +7726,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB17_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -7749,16 +7744,16 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: or_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 @@ -7772,12 +7767,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB17_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -7789,16 +7784,16 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: or_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7811,15 +7806,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB17_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 @@ -7835,12 +7829,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7859,8 +7853,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB17_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 @@ -7873,18 +7866,18 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7899,13 +7892,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB17_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7923,14 +7916,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7948,7 +7941,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB17_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7960,7 +7953,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: or_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7991,7 +7984,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -8000,13 +7993,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB17_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -8036,7 +8029,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -8045,12 +8038,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB17_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -8071,45 +8064,46 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB17_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: or_i32_varying: @@ -8124,7 +8118,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8132,26 +8126,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB17_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: or_i32_varying: @@ -8171,48 +8166,49 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB17_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i32_varying: @@ -8233,7 +8229,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8241,26 +8237,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB17_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -8272,21 +8269,21 @@ entry: define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: or_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB18_1 @@ -8295,8 +8292,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8306,8 +8303,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB18_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -8320,13 +8317,13 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: or_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 @@ -8334,16 +8331,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -8353,8 +8350,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB18_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -8367,13 +8364,13 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: or_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 @@ -8381,16 +8378,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -8399,8 +8396,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB18_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -8414,28 +8411,28 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: or_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -8446,9 +8443,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB18_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 @@ -8462,27 +8458,27 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: or_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -8493,9 +8489,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB18_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 @@ -8510,21 +8505,21 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -8532,9 +8527,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -8544,8 +8539,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB18_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -8560,29 +8555,29 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: or_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -8591,8 +8586,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB18_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -8606,7 +8601,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: or_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8643,8 +8638,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -8653,14 +8648,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB18_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -8699,8 +8694,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -8709,13 +8704,13 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB18_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -8747,26 +8742,26 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 @@ -8774,14 +8769,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -8791,24 +8786,25 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB18_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s0, v9 -; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s1, v8 +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s3, v9 +; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: or_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8823,23 +8819,23 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -8849,17 +8845,18 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB18_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s0, v9 -; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s1, v8 +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s3, v9 +; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: or_i64_varying: @@ -8890,29 +8887,29 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 @@ -8921,14 +8918,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -8937,27 +8934,27 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB18_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s0, v9 -; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s1, v8 +; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s3, v9 +; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8977,25 +8974,25 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 @@ -9003,18 +9000,18 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB18_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s0, v9 -; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s1, v8 +; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s3, v9 +; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s4, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -9028,19 +9025,19 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: xor_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB19_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -9052,13 +9049,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB19_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -9070,16 +9067,16 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: xor_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 @@ -9093,13 +9090,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB19_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -9111,16 +9108,16 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: xor_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 @@ -9134,12 +9131,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB19_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -9151,16 +9148,16 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: xor_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -9173,15 +9170,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB19_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -9197,12 +9193,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -9221,8 +9217,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB19_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -9235,18 +9230,18 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -9261,13 +9256,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB19_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9285,14 +9280,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -9310,7 +9305,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB19_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9322,7 +9317,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: xor_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9353,7 +9348,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -9362,13 +9357,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB19_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -9398,7 +9393,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -9407,12 +9402,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB19_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -9433,45 +9428,46 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB19_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: xor_i32_varying: @@ -9486,7 +9482,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9494,26 +9490,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB19_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: xor_i32_varying: @@ -9533,48 +9530,49 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB19_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i32_varying: @@ -9595,7 +9593,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -9603,26 +9601,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB19_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -9634,21 +9633,21 @@ entry: define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: xor_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] ; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB20_1 @@ -9657,8 +9656,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -9668,8 +9667,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB20_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -9682,13 +9681,13 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: xor_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 @@ -9696,16 +9695,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -9715,8 +9714,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB20_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -9729,13 +9728,13 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: xor_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 @@ -9743,16 +9742,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -9761,8 +9760,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB20_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -9776,28 +9775,28 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: xor_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -9808,9 +9807,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB20_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 @@ -9824,27 +9822,27 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: xor_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8 ; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -9855,9 +9853,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB20_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 @@ -9872,21 +9869,21 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -9894,9 +9891,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -9906,8 +9903,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB20_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -9922,29 +9919,29 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: xor_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -9953,8 +9950,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB20_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -9968,7 +9965,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: xor_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10005,8 +10002,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -10015,14 +10012,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB20_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -10061,8 +10058,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -10071,13 +10068,13 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB20_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 @@ -10109,26 +10106,26 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 @@ -10136,14 +10133,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -10153,24 +10150,25 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB20_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: xor_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10185,23 +10183,23 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -10211,17 +10209,18 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB20_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: xor_i64_varying: @@ -10252,29 +10251,29 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s2, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 @@ -10283,14 +10282,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -10299,27 +10298,27 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB20_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 -; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10339,25 +10338,25 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 @@ -10365,18 +10364,18 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB20_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v8 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 -; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s3, v9 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s4, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -10390,19 +10389,19 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: max_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB21_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -10414,13 +10413,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB21_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -10432,16 +10431,16 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: max_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 @@ -10455,13 +10454,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB21_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -10473,16 +10472,16 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: max_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 @@ -10496,12 +10495,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB21_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -10513,16 +10512,16 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: max_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -10535,15 +10534,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB21_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 @@ -10559,12 +10557,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -10583,8 +10581,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB21_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 @@ -10597,18 +10594,18 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, 1 +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -10623,13 +10620,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB21_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10647,14 +10644,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -10672,7 +10669,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB21_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10684,7 +10681,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: max_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10714,7 +10711,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -10724,13 +10721,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB21_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -10759,7 +10756,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -10769,12 +10766,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB21_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -10795,45 +10792,46 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB21_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: max_i32_varying: @@ -10848,34 +10846,35 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1032_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB21_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: max_i32_varying: @@ -10895,48 +10894,49 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB21_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: max_i32_varying: @@ -10957,34 +10957,35 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1132_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB21_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -11011,7 +11012,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -11045,9 +11046,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -11078,9 +11079,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -11113,8 +11114,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -11145,8 +11145,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -11178,7 +11177,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -11209,7 +11208,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -11232,14 +11231,14 @@ entry: define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: max_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 @@ -11252,8 +11251,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB23_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -11261,8 +11260,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -11272,8 +11271,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB23_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -11289,14 +11288,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: max_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -11309,16 +11308,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -11328,8 +11327,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB23_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] @@ -11345,14 +11344,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: max_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -11365,16 +11364,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -11383,8 +11382,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB23_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] @@ -11401,13 +11400,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: max_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 @@ -11417,16 +11416,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -11437,9 +11436,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB23_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2] @@ -11454,31 +11452,31 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: max_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -11489,9 +11487,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB23_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2] @@ -11507,14 +11504,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 @@ -11526,9 +11523,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -11536,9 +11533,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -11548,8 +11545,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB23_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11565,36 +11562,36 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: max_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -11603,8 +11600,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB23_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11619,7 +11616,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: max_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11687,8 +11684,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -11697,17 +11694,17 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB23_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -11777,8 +11774,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -11787,16 +11784,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB23_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -11853,10 +11850,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v5, 1 @@ -11868,16 +11865,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -11885,14 +11882,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -11902,27 +11899,28 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB23_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: max_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s2 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, 1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -11960,23 +11958,23 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -11986,18 +11984,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB23_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: max_i64_varying: @@ -12055,11 +12054,11 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -12073,16 +12072,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -12091,14 +12090,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -12107,29 +12106,29 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB23_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: max_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, 0, s2 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, 1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -12172,24 +12171,24 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[1:2], v[3:4] ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 @@ -12197,19 +12196,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB23_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -12223,19 +12222,19 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: min_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB24_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -12247,13 +12246,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB24_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -12265,16 +12264,16 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: min_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 @@ -12288,13 +12287,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB24_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -12306,16 +12305,16 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: min_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 @@ -12329,12 +12328,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB24_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -12346,16 +12345,16 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: min_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -12368,15 +12367,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB24_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 @@ -12392,12 +12390,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -12416,8 +12414,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB24_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 @@ -12430,18 +12427,18 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, -2 +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -12456,13 +12453,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB24_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12480,14 +12477,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -12505,7 +12502,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB24_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12517,7 +12514,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: min_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -12547,7 +12544,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -12557,13 +12554,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB24_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -12592,7 +12589,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -12602,12 +12599,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB24_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -12628,45 +12625,46 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB24_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: min_i32_varying: @@ -12681,34 +12679,35 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1032_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB24_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: min_i32_varying: @@ -12728,48 +12727,49 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB24_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: min_i32_varying: @@ -12790,34 +12790,35 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1132_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB24_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -12844,7 +12845,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB25_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -12878,9 +12879,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB25_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -12911,9 +12912,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB25_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -12946,8 +12947,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: .LBB25_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -12978,8 +12978,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB25_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -13011,7 +13010,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB25_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -13042,7 +13041,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB25_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -13065,14 +13064,14 @@ entry: define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: min_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 @@ -13085,8 +13084,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB26_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -13094,8 +13093,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -13105,8 +13104,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB26_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -13122,14 +13121,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: min_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -13142,16 +13141,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -13161,8 +13160,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB26_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] @@ -13178,14 +13177,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: min_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -13198,16 +13197,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -13216,8 +13215,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB26_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] @@ -13234,13 +13233,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: min_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 @@ -13250,16 +13249,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -13270,9 +13269,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB26_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] @@ -13287,31 +13285,31 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: min_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -13322,9 +13320,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB26_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2] @@ -13340,14 +13337,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 @@ -13359,9 +13356,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -13369,9 +13366,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -13381,8 +13378,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB26_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13398,36 +13395,36 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: min_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -13436,8 +13433,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB26_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13452,7 +13449,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: min_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -13520,8 +13517,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -13530,17 +13527,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB26_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -13610,8 +13607,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -13620,16 +13617,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s2 ; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB26_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -13686,10 +13683,10 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v4, -2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v5, -2 @@ -13701,16 +13698,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -13718,14 +13715,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -13735,27 +13732,28 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB26_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: min_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v4, -2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -13793,23 +13791,23 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -13819,18 +13817,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB26_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: min_i64_varying: @@ -13888,11 +13887,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -13906,16 +13905,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -13924,14 +13923,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -13940,29 +13939,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB26_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: min_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fffffff, 0, s2 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -14005,24 +14004,24 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[1:2], v[3:4] ; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 @@ -14030,19 +14029,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB26_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -14056,19 +14055,19 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: umax_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB27_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -14080,13 +14079,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB27_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -14098,16 +14097,16 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: umax_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 @@ -14121,13 +14120,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB27_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -14139,16 +14138,16 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: umax_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 @@ -14162,12 +14161,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB27_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -14179,16 +14178,16 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: umax_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -14201,15 +14200,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB27_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 @@ -14225,12 +14223,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -14249,8 +14247,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB27_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 @@ -14263,18 +14260,18 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -14289,13 +14286,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB27_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14313,14 +14310,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -14338,7 +14335,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB27_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14350,7 +14347,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: umax_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -14381,7 +14378,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -14390,13 +14387,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB27_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -14426,7 +14423,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -14435,12 +14432,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s2 ; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB27_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -14461,45 +14458,46 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB27_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umax_i32_varying: @@ -14514,7 +14512,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14522,26 +14520,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB27_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umax_i32_varying: @@ -14561,48 +14560,49 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB27_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umax_i32_varying: @@ -14623,7 +14623,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -14631,26 +14631,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB27_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -14677,7 +14678,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB28_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -14710,7 +14711,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -14742,7 +14743,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB28_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -14776,8 +14777,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: .LBB28_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -14808,8 +14808,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB28_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -14841,7 +14840,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB28_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -14872,7 +14871,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB28_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -14895,13 +14894,13 @@ entry: define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: umax_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 @@ -14914,8 +14913,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB29_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -14923,8 +14922,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -14934,8 +14933,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB29_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -14951,13 +14950,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: umax_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -14970,16 +14969,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -14989,8 +14988,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB29_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] @@ -15006,13 +15005,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: umax_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -15025,16 +15024,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -15043,8 +15042,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB29_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] @@ -15061,12 +15060,12 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: umax_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 @@ -15076,16 +15075,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -15096,9 +15095,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB29_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2] @@ -15113,30 +15111,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: umax_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -15147,9 +15145,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB29_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2] @@ -15165,13 +15162,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 @@ -15184,8 +15181,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -15193,9 +15190,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -15205,8 +15202,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB29_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15222,35 +15219,35 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: umax_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -15259,8 +15256,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB29_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15275,7 +15272,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: umax_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -15344,8 +15341,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -15354,17 +15351,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_max_rtn_u64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB29_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v6 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -15435,8 +15432,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -15445,16 +15442,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s2 ; GFX9_DPP-NEXT: ds_max_rtn_u64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB29_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v6 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -15511,10 +15508,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -15526,16 +15523,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -15543,14 +15540,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -15560,27 +15557,28 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB29_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umax_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15618,23 +15616,23 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -15644,18 +15642,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB29_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umax_i64_varying: @@ -15713,11 +15712,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -15731,16 +15730,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -15749,14 +15748,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -15765,29 +15764,29 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB29_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umax_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15824,25 +15823,24 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_cndmask_b32 v2, v4, v2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 @@ -15850,19 +15848,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB29_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -15876,19 +15874,19 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: umin_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX7LESS_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX7LESS_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB30_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -15900,13 +15898,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB30_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -15918,16 +15916,16 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-LABEL: umin_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 @@ -15941,13 +15939,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB30_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -15959,16 +15957,16 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-LABEL: umin_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 -; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3 +; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 @@ -15982,12 +15980,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB30_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -15999,16 +15997,16 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: umin_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 -; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3 +; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16021,15 +16019,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB30_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 @@ -16045,12 +16042,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16069,8 +16066,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: .LBB30_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 @@ -16083,18 +16079,18 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 -; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3 +; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 +; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16109,13 +16105,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB30_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -16133,14 +16129,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 +; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16158,7 +16154,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB30_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -16170,7 +16166,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: umin_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -16200,7 +16196,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -16210,13 +16206,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB30_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -16245,7 +16241,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -16255,12 +16251,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB30_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -16281,45 +16277,46 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB30_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umin_i32_varying: @@ -16334,34 +16331,35 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1032_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB30_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umin_i32_varying: @@ -16381,48 +16379,49 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 16 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v3, s2, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB30_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umin_i32_varying: @@ -16443,34 +16442,35 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s2, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s0 ; GFX1132_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB30_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() @@ -16497,7 +16497,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB31_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -16530,7 +16530,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB31_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -16562,7 +16562,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB31_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -16596,8 +16596,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: .LBB31_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -16628,8 +16627,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: .LBB31_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -16661,7 +16659,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB31_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -16692,7 +16690,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB31_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -16715,13 +16713,13 @@ entry: define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: umin_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 @@ -16734,8 +16732,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB32_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16743,8 +16741,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -16754,8 +16752,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB32_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -16771,13 +16769,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: umin_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -16790,16 +16788,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -16809,8 +16807,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB32_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] @@ -16826,13 +16824,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: umin_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3] ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 @@ -16845,16 +16843,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 ; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -16863,8 +16861,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB32_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] @@ -16881,12 +16879,12 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-LABEL: umin_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 @@ -16896,16 +16894,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -16916,9 +16914,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB32_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2] @@ -16933,30 +16930,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-LABEL: umin_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3 ; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 @@ -16967,9 +16964,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB32_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2] @@ -16985,13 +16981,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE: ; %bb.0: ; %entry ; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 @@ -17004,8 +17000,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 @@ -17013,9 +17009,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 @@ -17025,8 +17021,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB32_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -17042,35 +17038,35 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-LABEL: umin_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 @@ -17079,8 +17075,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB32_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -17095,7 +17091,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: umin_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -17163,8 +17159,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -17173,17 +17169,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s2 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_u64 v[6:7], v8, v[6:7] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB32_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -17253,8 +17249,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v3, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v4 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] @@ -17263,16 +17259,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s2 ; GFX9_DPP-NEXT: ds_min_rtn_u64 v[6:7], v8, v[6:7] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB32_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -17329,10 +17325,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 @@ -17344,16 +17340,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -17361,14 +17357,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -17378,27 +17374,28 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB32_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064_DPP-NEXT: s_mov_b32 null, 0 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umin_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s4 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -17436,23 +17433,23 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -17462,18 +17459,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB32_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 null, 0 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umin_i64_varying: @@ -17531,11 +17529,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf @@ -17549,16 +17547,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v4, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s2, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v4, s3, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s8, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s7, 32 @@ -17567,14 +17565,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 @@ -17583,29 +17581,29 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB32_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umin_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry ; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_mov_b32 v3, -1 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, -1 :: v_dual_mov_b32 v5, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -17642,25 +17640,24 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[1:2], v[3:4] ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, -1 :: v_dual_cndmask_b32 v2, v4, v2 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, -1 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v2, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 @@ -17668,19 +17665,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB32_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v8 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[7:8] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index a1f7d2ca3d3383..905a515d7c125c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -18,23 +18,23 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32, i define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX6-NEXT: s_mul_i32 s4, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_mul_i32 s2, s2, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -46,23 +46,23 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -74,23 +74,23 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -101,26 +101,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -137,18 +136,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -158,25 +156,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -196,16 +194,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -217,27 +215,27 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -257,7 +255,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -265,11 +263,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -287,24 +285,24 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -317,24 +315,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -347,24 +345,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -376,27 +374,26 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -406,55 +403,55 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 -; GFX10W32-NEXT: s_mov_b32 s4, exec_lo +; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -466,25 +463,25 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) @@ -496,28 +493,28 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -529,31 +526,30 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX12W32-NEXT: s_mov_b32 s4, exec_lo +; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -569,19 +565,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_mov_b32 m0, s5 -; GFX6-NEXT: v_readlane_b32 s8, v0, s5 -; GFX6-NEXT: v_writelane_b32 v1, s4, m0 -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s3 +; GFX6-NEXT: v_readlane_b32 s8, v0, s3 +; GFX6-NEXT: v_writelane_b32 v1, s2, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -592,13 +588,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB2_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -612,16 +608,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 @@ -634,13 +630,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -653,16 +649,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 @@ -675,13 +671,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -693,16 +689,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec -; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 ; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -714,16 +710,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -738,12 +733,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 -; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 ; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -754,16 +749,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -775,18 +769,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -800,13 +794,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -824,14 +818,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -843,13 +837,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -862,18 +856,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: s_mov_b32 s4, 0 +; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -887,17 +881,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -913,14 +908,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -934,14 +929,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -959,24 +954,25 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_add v2, v0, s[0:3], 0 offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -985,80 +981,78 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc +; GFX9-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: @@ -1071,23 +1065,23 @@ entry: define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB4_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX6-NEXT: s_mul_i32 s4, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_mul_i32 s2, s2, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB4_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1100,23 +1094,23 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1129,23 +1123,23 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1157,26 +1151,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1194,18 +1187,17 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1216,25 +1208,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB4_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1255,16 +1247,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB4_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1277,27 +1269,27 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB4_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1318,7 +1310,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -1326,11 +1318,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1349,24 +1341,24 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1379,24 +1371,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1409,24 +1401,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1438,25 +1430,25 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1468,55 +1460,55 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 -; GFX10W32-NEXT: s_mov_b32 s4, exec_lo +; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1529,25 +1521,25 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) @@ -1560,28 +1552,28 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1594,27 +1586,27 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX12W32-NEXT: s_mov_b32 s4, exec_lo +; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1634,19 +1626,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB6_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_mov_b32 m0, s5 -; GFX6-NEXT: v_readlane_b32 s8, v0, s5 -; GFX6-NEXT: v_writelane_b32 v1, s4, m0 -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s3 +; GFX6-NEXT: v_readlane_b32 s8, v0, s3 +; GFX6-NEXT: v_writelane_b32 v1, s2, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB6_1 ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1657,13 +1649,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB6_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1677,16 +1669,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1699,13 +1691,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB6_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1718,16 +1710,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1740,13 +1732,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB6_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1758,16 +1750,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec -; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 ; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -1779,16 +1771,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1803,12 +1794,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 -; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 ; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -1819,16 +1810,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1840,18 +1830,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -1865,13 +1855,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1889,14 +1879,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -1908,13 +1898,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB6_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1928,18 +1918,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: s_mov_b32 s4, 0 +; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -1953,17 +1943,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1979,14 +1970,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 @@ -2000,14 +1991,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_4: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -2026,24 +2017,25 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc +; GFX8-NEXT: buffer_atomic_sub v2, v0, s[0:3], 0 offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2052,80 +2044,78 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc +; GFX9-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc +; GFX10-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 offen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index d0987068841b9b..9801e6ede5eebd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -18,24 +18,24 @@ declare i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32, ptr addrspace(8), i32 define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX6-NEXT: s_mul_i32 s4, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_mul_i32 s2, s2, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -47,24 +47,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -76,24 +76,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -104,27 +104,26 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -141,19 +140,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -163,26 +161,26 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB0_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -202,17 +200,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB0_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -224,28 +222,28 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -265,7 +263,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -273,11 +271,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -295,25 +293,25 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB1_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -326,25 +324,25 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -357,25 +355,25 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -387,28 +385,27 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -418,57 +415,57 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 -; GFX10W32-NEXT: s_mov_b32 s4, exec_lo +; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: s_mov_b32 null, 0 -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB1_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -480,26 +477,26 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB1_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) @@ -511,29 +508,29 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -545,31 +542,30 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX12W32-NEXT: s_mov_b32 s4, exec_lo +; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -585,19 +581,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_mov_b32 m0, s5 -; GFX6-NEXT: v_readlane_b32 s8, v0, s5 -; GFX6-NEXT: v_writelane_b32 v1, s4, m0 -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s3 +; GFX6-NEXT: v_readlane_b32 s8, v0, s3 +; GFX6-NEXT: v_writelane_b32 v1, s2, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -608,14 +604,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB2_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB2_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -629,16 +625,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 @@ -651,14 +647,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -671,16 +667,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 @@ -693,14 +689,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -712,16 +708,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec -; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 ; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -733,17 +729,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -758,12 +753,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 -; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 ; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -774,17 +769,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -796,18 +790,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -821,14 +815,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB2_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -846,14 +840,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -865,13 +859,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB2_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -884,18 +878,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: s_mov_b32 s4, 0 +; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -909,18 +903,19 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -936,14 +931,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 @@ -957,14 +952,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -982,24 +977,25 @@ entry: define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc +; GFX8-NEXT: buffer_atomic_add v2, v0, s[0:3], 0 idxen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1008,80 +1004,78 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc +; GFX9-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc +; GFX10-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vindex: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 idxen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vindex: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], 0 idxen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vindex: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vindex: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: @@ -1095,29 +1089,30 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s0, 0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1126,88 +1121,89 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_mov_b32 s0, 0 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11W64-NEXT: s_mov_b32 s6, 0 ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11W32-NEXT: s_mov_b32 s6, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: @@ -1220,24 +1216,24 @@ entry: define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX6-NEXT: s_mul_i32 s4, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_mul_i32 s2, s2, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1250,24 +1246,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1280,24 +1276,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1309,27 +1305,26 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s2, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1347,19 +1342,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1370,26 +1364,26 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX11W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB5_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1410,17 +1404,17 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB5_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1433,28 +1427,28 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX12W64-NEXT: s_mul_i32 s2, s2, 5 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1475,7 +1469,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe @@ -1483,11 +1477,11 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1506,25 +1500,25 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s6, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mul_i32 s2, s6, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB6_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1537,25 +1531,25 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s6, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1568,25 +1562,25 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1598,26 +1592,26 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[4:5], exec +; GFX10W64-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: s_mul_i32 s2, s6, s2 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1629,57 +1623,57 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 -; GFX10W32-NEXT: s_mov_b32 s4, exec_lo +; GFX10W32-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX10W32-NEXT: s_mov_b32 s2, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W32-NEXT: s_mul_i32 s2, s0, s2 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX11W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1692,26 +1686,26 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo ; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) @@ -1724,29 +1718,29 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 -; GFX12W64-NEXT: s_mov_b64 s[4:5], exec +; GFX12W64-NEXT: s_load_b32 s6, s[4:5], 0x44 +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX12W64-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX12W64-NEXT: s_mul_i32 s2, s6, s2 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1759,27 +1753,27 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 -; GFX12W32-NEXT: s_mov_b32 s4, exec_lo +; GFX12W32-NEXT: s_load_b32 s0, s[4:5], 0x44 +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo ; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s2, s0, s2 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1799,19 +1793,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_mov_b32 m0, s5 -; GFX6-NEXT: v_readlane_b32 s8, v0, s5 -; GFX6-NEXT: v_writelane_b32 v1, s4, m0 -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_mov_b32 m0, s3 +; GFX6-NEXT: v_readlane_b32 s8, v0, s3 +; GFX6-NEXT: v_writelane_b32 v1, s2, m0 +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] -; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -1822,14 +1816,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB7_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB7_4: ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1843,16 +1837,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: v_readlane_b32 s8, v0, s5 -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: v_writelane_b32 v1, s4, m0 -; GFX8-NEXT: s_add_i32 s4, s4, s8 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_mov_b32 m0, s3 +; GFX8-NEXT: v_readlane_b32 s8, v0, s3 +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX8-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8-NEXT: s_add_i32 s2, s2, s8 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 @@ -1865,14 +1859,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB7_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1885,16 +1879,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX9-NEXT: s_mov_b32 m0, s5 -; GFX9-NEXT: v_readlane_b32 s8, v0, s5 -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: v_writelane_b32 v1, s4, m0 -; GFX9-NEXT: s_add_i32 s4, s4, s8 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX9-NEXT: s_mov_b32 m0, s3 +; GFX9-NEXT: v_readlane_b32 s8, v0, s3 +; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX9-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9-NEXT: s_add_i32 s2, s2, s8 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 @@ -1907,14 +1901,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB7_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1926,16 +1920,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry ; GFX10W64-NEXT: s_mov_b64 s[0:1], exec -; GFX10W64-NEXT: s_mov_b32 s4, 0 +; GFX10W64-NEXT: s_mov_b32 s2, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 -; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3 +; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3 ; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX10W64-NEXT: s_add_i32 s4, s4, s8 +; GFX10W64-NEXT: s_add_i32 s2, s2, s8 ; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -1947,17 +1941,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1972,12 +1965,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 -; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 -; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 +; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2 +; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2 ; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 +; GFX10W32-NEXT: s_add_i32 s0, s0, s3 ; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -1988,17 +1981,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2010,18 +2002,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11W64-NEXT: s_mov_b64 s[0:1], exec -; GFX11W64-NEXT: s_mov_b32 s4, 0 +; GFX11W64-NEXT: s_mov_b32 s2, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: s_add_i32 s4, s4, s8 +; GFX11W64-NEXT: s_add_i32 s2, s2, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -2035,14 +2027,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB7_4: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -2060,14 +2052,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: ; implicit-def: $vgpr0 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 +; GFX11W32-NEXT: s_add_i32 s0, s0, s3 ; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd @@ -2079,13 +2071,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB7_4: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -2099,18 +2091,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: s_mov_b64 s[0:1], exec -; GFX12W64-NEXT: s_mov_b32 s4, 0 +; GFX12W64-NEXT: s_mov_b32 s2, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr0 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] +; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1] ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 -; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 +; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3 +; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3 ; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 +; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd @@ -2124,18 +2116,19 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_alu 0xfffe -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v1, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: s_wait_alu 0xfffe ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -2151,14 +2144,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 +; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1 ; GFX12W32-NEXT: s_wait_alu 0xfffe -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 -; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 +; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2 +; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2 ; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 +; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2172,14 +2165,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: ; GFX12W32-NEXT: s_wait_alu 0xfffe ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -2198,24 +2191,25 @@ entry: define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc +; GFX8-NEXT: buffer_atomic_sub v2, v0, s[0:3], 0 idxen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2224,80 +2218,78 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc +; GFX9-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc +; GFX10-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vindex: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 idxen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vindex: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], 0 idxen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vindex: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vindex: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: @@ -2311,29 +2303,30 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s0, 0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2342,88 +2335,89 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_mov_b32 s0, 0 +; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11W64-NEXT: s_mov_b32 s6, 0 ; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11W32-NEXT: s_mov_b32 s6, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc +; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 +; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 +; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll index 2a019e4712740c..d0313267b56d71 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll @@ -20,7 +20,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_agent: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_agent(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -29,7 +29,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wg: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wg(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -38,7 +38,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wavefront: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wavefront(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -47,7 +47,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_single_thread: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_single_thread(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -56,7 +56,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_aoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_aoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -65,7 +65,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wgoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wgoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -74,7 +74,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wfoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_wfoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: @@ -83,7 +83,7 @@ main_body: } ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_stoa: -; GFX90A-HW: global_atomic_add_f32 v0, v1, s[4:5] +; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3] ; GFX90A-HW: s_endpgm define amdgpu_kernel void @atomic_add_unsafe_hw_stoa(ptr addrspace(1) %ptr, float %val) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index c630effa4b048a..e74fd21365c9d7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -33,7 +33,7 @@ entry: define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -42,7 +42,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -58,13 +58,13 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-SDAG-NEXT: s_endpgm @@ -72,13 +72,13 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-GISEL-NEXT: s_endpgm @@ -92,7 +92,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %a ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -114,7 +114,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 @@ -122,7 +122,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 @@ -136,26 +136,26 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 -; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 -; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 @@ -167,7 +167,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -177,7 +177,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -193,7 +193,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -203,7 +203,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -219,7 +219,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { ; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 @@ -230,7 +230,7 @@ define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ; ; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index d9ce1e4efe0e50..9a647f04d43da6 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -27662,7 +27662,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_and_b32 s4, s6, 0x80000000 +; GCN-NEXT: s_and_b32 s4, s16, 0x80000000 ; GCN-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27673,7 +27673,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_and_b32 s4, s6, 0x80000000 +; GFX7-NEXT: s_and_b32 s4, s16, 0x80000000 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27684,7 +27684,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -27692,14 +27692,14 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_s_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s6 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_s_bf16: @@ -27715,7 +27715,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_s_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27726,7 +27726,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX7-LABEL: v_copysign_s_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27738,7 +27738,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -27746,14 +27746,14 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_s_bf16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s6, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_s_bf16_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll index 2c179de2a9c35c..3bf3e47f546f43 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -24,11 +24,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; VI-SDWA-LABEL: bfe_combine8: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDWA-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -42,13 +42,13 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; CI-LABEL: bfe_combine8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 6, v0 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: v_and_b32_e32 v0, 0x3fc, v0 @@ -71,11 +71,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 16, 16 ; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -91,11 +91,11 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; VI-SDWA-LABEL: bfe_combine16: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDWA-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0 ; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -111,13 +111,13 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; CI-LABEL: bfe_combine16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index bdba8c57dc745d..18d19673995115 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -58,7 +58,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -82,7 +82,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -137,7 +137,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -181,7 +181,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -209,7 +209,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -228,7 +228,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -262,7 +262,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -285,7 +285,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -322,7 +322,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -338,7 +338,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -362,7 +362,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -382,7 +382,7 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -410,31 +410,31 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[6:7], 0x0 -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_b32 s0, s2, s0 -; SI-NEXT: s_bfe_i32 s0, s0, 0xf0000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_or_b32 s2, s2, s4 +; SI-NEXT: s_bfe_i32 s4, s2, 0xf0000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: s_or_b32 s0, s2, s3 ; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -453,35 +453,35 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[6:7], 0x0 -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s2, 17 -; SI-NEXT: s_lshl_b32 s0, s0, 19 -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: s_ashr_i32 s0, s0, 17 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_lshl_b32 s2, s2, 17 +; SI-NEXT: s_lshl_b32 s4, s4, 19 +; SI-NEXT: s_or_b32 s2, s2, s4 +; SI-NEXT: s_ashr_i32 s4, s2, 17 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s2, 17 -; VI-NEXT: s_lshl_b32 s0, s0, 19 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_lshl_b32 s0, s2, 17 +; VI-NEXT: s_lshl_b32 s1, s3, 19 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: s_ashr_i32 s0, s0, 17 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -500,35 +500,35 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[6:7], 0x0 -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s2, 17 -; SI-NEXT: s_lshl_b32 s0, s0, 16 -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: s_ashr_i32 s0, s0, 17 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_lshl_b32 s2, s2, 17 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s2, s2, s4 +; SI-NEXT: s_ashr_i32 s4, s2, 17 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[6:7], 0x0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s2, 17 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_lshl_b32 s0, s2, 17 +; VI-NEXT: s_lshl_b32 s1, s3, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: s_ashr_i32 s0, s0, 17 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 78d764898a3b93..c67d1e1c817da4 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -11,72 +11,72 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_def_i32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_andn2_b32 s6, s6, s4 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s6, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_andn2_b32 s2, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_or_b32 s0, s2, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_def_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_andn2_b32 s2, s6, s4 -; GFX8-NEXT: s_and_b32 s3, s5, s4 -; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_andn2_b32 s2, s2, s0 +; GFX8-NEXT: s_and_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_def_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s2, s6, s4 -; GFX10-NEXT: s_and_b32 s3, s5, s4 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_andn2_b32 s2, s2, s0 +; GFX10-NEXT: s_and_b32 s0, s1, s0 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_def_i32: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6 -; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6 -; GFX8-GISEL-NEXT: s_or_b32 s0, s0, s1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-GISEL-NEXT: s_andn2_b32 s4, s4, s2 +; GFX8-GISEL-NEXT: s_and_b32 s2, s3, s2 +; GFX8-GISEL-NEXT: s_or_b32 s2, s4, s2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8-GISEL-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_bfi_def_i32: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6 -; GFX10-GISEL-NEXT: s_andn2_b32 s0, s0, s6 -; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_andn2_b32 s4, s4, s2 +; GFX10-GISEL-NEXT: s_and_b32 s2, s3, s2 +; GFX10-GISEL-NEXT: s_or_b32 s2, s4, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm entry: %0 = xor i32 %x, -1 @@ -130,72 +130,72 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s4, s4, s5 -; GFX7-NEXT: s_xor_b32 s4, s6, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_xor_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s0, s0, s1 +; GFX7-NEXT: s_xor_b32 s0, s2, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s2, s5, s6 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_xor_b32 s2, s6, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_xor_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, s1 +; GFX8-NEXT: s_xor_b32 s0, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s2, s5, s6 -; GFX10-NEXT: s_and_b32 s2, s4, s2 -; GFX10-NEXT: s_xor_b32 s2, s6, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, s1 +; GFX10-NEXT: s_xor_b32 s0, s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ch: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0 -; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s1 -; GFX8-GISEL-NEXT: s_xor_b32 s0, s0, s1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-GISEL-NEXT: s_xor_b32 s3, s3, s4 +; GFX8-GISEL-NEXT: s_and_b32 s2, s2, s3 +; GFX8-GISEL-NEXT: s_xor_b32 s2, s4, s2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8-GISEL-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_bfi_sha256_ch: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0 -; GFX10-GISEL-NEXT: s_and_b32 s1, s6, s1 -; GFX10-GISEL-NEXT: s_xor_b32 s0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_xor_b32 s3, s3, s4 +; GFX10-GISEL-NEXT: s_and_b32 s2, s2, s3 +; GFX10-GISEL-NEXT: s_xor_b32 s2, s4, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm entry: %0 = xor i32 %y, %z @@ -454,77 +454,77 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ma: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_or_b32 s4, s4, s6 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s7, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_or_b32 s0, s3, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ma: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s3, s4, s6 -; GFX8-NEXT: s_and_b32 s2, s4, s6 -; GFX8-NEXT: s_and_b32 s3, s5, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_and_b32 s3, s0, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s3, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ma: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_or_b32 s2, s4, s6 -; GFX10-NEXT: s_and_b32 s3, s4, s6 -; GFX10-NEXT: s_and_b32 s2, s5, s2 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_or_b32 s3, s0, s2 +; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0 -; GFX8-GISEL-NEXT: s_or_b32 s0, s6, s0 -; GFX8-GISEL-NEXT: s_and_b32 s0, s7, s0 -; GFX8-GISEL-NEXT: s_or_b32 s0, s1, s0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-GISEL-NEXT: s_and_b32 s5, s2, s4 +; GFX8-GISEL-NEXT: s_or_b32 s2, s2, s4 +; GFX8-GISEL-NEXT: s_and_b32 s2, s3, s2 +; GFX8-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8-GISEL-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_bfi_sha256_ma: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0 -; GFX10-GISEL-NEXT: s_and_b32 s0, s6, s0 -; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s1 -; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_or_b32 s5, s2, s4 +; GFX10-GISEL-NEXT: s_and_b32 s2, s2, s4 +; GFX10-GISEL-NEXT: s_and_b32 s3, s3, s5 +; GFX10-GISEL-NEXT: s_or_b32 s2, s2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm entry: %0 = and i32 %x, %z @@ -1402,28 +1402,28 @@ entry: define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_0: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] -; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX7-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] +; GFX7-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bitselect_i64_pat_0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX8-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -1435,11 +1435,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] -; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX10-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] ; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -1450,11 +1450,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] -; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -1466,11 +1466,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] -; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -1490,29 +1490,29 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bitselect_i64_pat_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1523,12 +1523,12 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1538,12 +1538,12 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1554,12 +1554,12 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1578,29 +1578,29 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bitselect_i64_pat_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1611,12 +1611,12 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1626,12 +1626,12 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1642,12 +1642,12 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] -; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1666,31 +1666,31 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX7-LABEL: s_bfi_sha256_ma_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b64 s[8:9], s[4:5], s[0:1] -; GFX7-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; GFX7-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] +; GFX7-NEXT: s_and_b64 s[8:9], s[0:1], s[4:5] +; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GFX7-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ma_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] -; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; GFX8-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_and_b64 s[6:7], s[0:1], s[4:5] +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1701,12 +1701,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-LABEL: s_bfi_sha256_ma_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] -; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_or_b64 s[6:7], s[0:1], s[4:5] +; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -1717,13 +1717,13 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] -; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-GISEL-NEXT: s_and_b64 s[6:7], s[0:1], s[4:5] +; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1734,12 +1734,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] -; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] -; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] +; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll index 4ad3667f689583..4b38215ebc597d 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll @@ -283,7 +283,7 @@ define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) { define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index 2e64db12ef564c..d287d0082cdc58 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 { ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfm_b32 s2, s2, s3 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfm_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -36,11 +36,11 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 { ; SI-LABEL: s_bfm_pattern_simple: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s4, s4, 0 +; SI-NEXT: s_bfm_b32 s4, s2, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -48,10 +48,10 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) # ; ; VI-LABEL: s_bfm_pattern_simple: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfm_b32 s2, s4, 0 +; VI-NEXT: s_bfm_b32 s2, s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index b281c1bf3f9c48..bb7974335bf284 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -21,12 +21,12 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, s4 +; SI-NEXT: s_brev_b32 s4, s6 ; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -34,12 +34,12 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s6, s[4:5], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s4, s4 +; FLAT-NEXT: s_brev_b32 s4, s6 ; FLAT-NEXT: s_lshr_b32 s4, s4, 16 ; FLAT-NEXT: v_mov_b32_e32 v0, s4 ; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; GISEL-LABEL: s_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -62,10 +62,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-FLAT-LABEL: s_brev_i16: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] @@ -74,11 +74,11 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-LABEL: s_brev_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16 @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_mov_b32 s10, s6 @@ -132,7 +132,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -147,7 +147,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i16: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] @@ -179,34 +179,34 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, s4 +; SI-NEXT: s_brev_b32 s4, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s6, s[4:5], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s4, s4 +; FLAT-NEXT: s_brev_b32 s4, s6 ; FLAT-NEXT: v_mov_b32_e32 v0, s4 ; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b32 s2, s4 +; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -216,11 +216,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-FLAT-LABEL: s_brev_i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 @@ -230,11 +230,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-GISEL-LABEL: s_brev_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s4 +; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -247,7 +247,7 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -266,7 +266,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -299,7 +299,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -314,7 +314,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -335,7 +335,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -350,7 +350,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -365,7 +365,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 @@ -378,7 +378,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -393,7 +393,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-GISEL-LABEL: s_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 @@ -410,7 +410,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -447,7 +447,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -465,7 +465,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -481,7 +481,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -504,7 +504,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +518,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -532,7 +532,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -544,7 +544,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-FLAT-LABEL: s_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3] ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 @@ -555,7 +555,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-GISEL-LABEL: s_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] @@ -571,7 +571,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -608,7 +608,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -626,7 +626,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -642,7 +642,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -665,79 +665,79 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 { ; SI-LABEL: s_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b64 s[6:7], s[6:7] -; SI-NEXT: s_brev_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_brev_b64 s[2:3], s[2:3] +; SI-NEXT: s_brev_b64 s[0:1], s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b64 s[6:7], s[6:7] -; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5] -; FLAT-NEXT: v_mov_b32_e32 v0, s4 -; FLAT-NEXT: v_mov_b32_e32 v1, s5 -; FLAT-NEXT: v_mov_b32_e32 v2, s6 -; FLAT-NEXT: v_mov_b32_e32 v3, s7 -; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; FLAT-NEXT: s_brev_b64 s[2:3], s[2:3] +; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1] +; FLAT-NEXT: v_mov_b32_e32 v0, s0 +; FLAT-NEXT: v_mov_b32_e32 v1, s1 +; FLAT-NEXT: v_mov_b32_e32 v2, s2 +; FLAT-NEXT: v_mov_b32_e32 v3, s3 +; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] -; GISEL-NEXT: s_brev_b64 s[2:3], s[6:7] +; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] +; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v4, s8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GISEL-NEXT: s_endpgm ; ; GFX11-FLAT-LABEL: s_brev_v2i64: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-FLAT-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5] -; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7] -; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 -; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_brev_b64 s[0:1], s[0:1] +; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[2:3] +; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_v2i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] -; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[6:7] +; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-GISEL-NEXT: s_endpgm %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, ptr addrspace(1) %out @@ -747,7 +747,7 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -788,7 +788,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -808,7 +808,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -826,7 +826,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index 6dfc832ff3ac9f..f9ffa5ae57f3ed 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -4,16 +4,15 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %extractVec358.i.i, i32 %cmp5.i.i.arg, float %i1.i, i32 %cmp221.i.i.arg, i32 %cmp262.i.i.arg, ptr addrspace(1) %arg) { ; CHECK-LABEL: blender_no_live_segment_at_def_error: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: s_mov_b32 s12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s40, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_8 @@ -22,50 +21,54 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_cbranch_scc1 .LBB0_4 ; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i ; CHECK-NEXT: s_cmp_lg_u32 s43, 0 -; CHECK-NEXT: s_mov_b32 s15, 0 -; CHECK-NEXT: s_cselect_b32 s8, -1, 0 -; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s8 +; CHECK-NEXT: s_mov_b32 s17, 0 +; CHECK-NEXT: s_cselect_b32 s12, -1, 0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_mov_b32 s36, 0 -; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b32 s14, s12 +; CHECK-NEXT: s_mov_b32 s15, s12 +; CHECK-NEXT: s_mov_b32 s13, s12 +; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15] +; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13] ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i -; CHECK-NEXT: v_cmp_lt_f32_e64 s8, s41, 0 +; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0 ; CHECK-NEXT: s_mov_b32 s36, 1.0 -; CHECK-NEXT: s_mov_b32 s15, 0x7fc00000 +; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000 ; CHECK-NEXT: s_mov_b32 s37, s36 ; CHECK-NEXT: s_mov_b32 s38, s36 ; CHECK-NEXT: s_mov_b32 s39, s36 -; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_7 ; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i -; CHECK-NEXT: s_add_u32 s8, s6, 40 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z3dotDv3_fS_@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z3dotDv3_fS_@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s12, s8, 40 +; CHECK-NEXT: s_addc_u32 s13, s9, 0 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, _Z3dotDv3_fS_@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, _Z3dotDv3_fS_@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1 -; CHECK-NEXT: v_add_f32_e64 v1, s15, s36 -; CHECK-NEXT: s_mov_b32 s36, 0 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] +; CHECK-NEXT: v_add_f32_e64 v1, s17, s36 +; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] +; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] +; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_or3_b32 v31, v0, v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v0, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: s_mov_b32 s36, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35] ; CHECK-NEXT: s_mov_b32 s37, s36 ; CHECK-NEXT: s_mov_b32 s38, s36 ; CHECK-NEXT: s_mov_b32 s39, s36 @@ -76,7 +79,7 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x20 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 ; CHECK-NEXT: v_mov_b32_e32 v0, s36 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s37 diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index afff56c7f0eb23..98832aaa3bc255 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -6,18 +6,18 @@ define amdgpu_kernel void @br_cc_f16( ; SI-LABEL: br_cc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -29,62 +29,62 @@ define amdgpu_kernel void @br_cc_f16( ; SI-NEXT: .LBB0_2: ; %two ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: .LBB0_3: ; %one -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: br_cc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_cbranch_vccnz .LBB0_2 ; VI-NEXT: ; %bb.1: ; %one -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB0_2: ; %two -; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: br_cc_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s2 -; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s6 -; GFX11-NEXT: s_mov_b32 s1, s7 -; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s6, s2 -; GFX11-NEXT: s_mov_b32 s7, s3 +; GFX11-NEXT: s_mov_b32 s2, s6 +; GFX11-NEXT: s_mov_b32 s3, s7 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_cbranch_vccnz .LBB0_2 ; GFX11-NEXT: ; %bb.1: ; %one -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB0_2: ; %two -; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -107,7 +107,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_a( ; SI-LABEL: br_cc_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -133,7 +133,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; VI-LABEL: br_cc_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -153,7 +153,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; GFX11-LABEL: br_cc_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -189,7 +189,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_b( ; SI-LABEL: br_cc_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; VI-LABEL: br_cc_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -237,7 +237,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; GFX11-LABEL: br_cc_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 055e9850de3d68..98136347ab702c 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -5,91 +5,90 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-LABEL: name: f1 ; GFX90A: bb.0.bb: ; GFX90A-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr17, $sgpr12_sgpr13 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr32 = S_MOV_B32 0 - ; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc - ; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc - ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $flat_scr_lo = S_ADD_U32 $sgpr12, $sgpr17, implicit-def $scc + ; GFX90A-NEXT: $flat_scr_hi = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc + ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_XOR_B64 renamable $sgpr8_sgpr9, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 8, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr26_sgpr27 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr26_sgpr27 = S_XOR_B64 killed renamable $sgpr26_sgpr27, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: - ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.59, implicit $vcc + ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.58, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22 ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24 - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: - ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.58(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.58, implicit $vcc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr16, $vgpr0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec + ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.35, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -108,120 +107,123 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr21 = COPY $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr23 = COPY $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22 = COPY $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr23 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: - ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 - ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $sgpr28_sgpr29, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.63, implicit $exec + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr16_sgpr17, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr6, 48, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr7, 0, implicit-def dead $scc, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2, target-flags(amdgpu-gotprel32-hi) @f2, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) - ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2, target-flags(amdgpu-gotprel32-hi) @f2, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_LOAD_DWORDX2_IMM killed renamable $sgpr12_sgpr13, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; GFX90A-NEXT: $sgpr12 = COPY killed renamable $sgpr14 + ; GFX90A-NEXT: $sgpr13 = COPY killed renamable $sgpr15 + ; GFX90A-NEXT: $sgpr14 = COPY killed renamable $sgpr16 + ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.18, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec @@ -230,15 +232,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec @@ -247,15 +249,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec @@ -264,58 +266,58 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.26, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.28, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.34, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.29.Flow43: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -323,17 +325,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.30.bb19: ; GFX90A-NEXT: successors: %bb.31(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.31.Flow44: ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr54_sgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: @@ -349,32 +351,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.34.bb26: ; GFX90A-NEXT: successors: %bb.29(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -392,29 +394,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr40_sgpr41 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -431,39 +431,39 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_ANDN2_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr54_sgpr55, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -481,43 +481,44 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr40_sgpr41, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: - ; GFX90A-NEXT: successors: %bb.47(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc - ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr16_sgpr17 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc + ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -532,48 +533,47 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr54_sgpr55, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: - ; GFX90A-NEXT: successors: %bb.49(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr44_sgpr45 + ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.49, implicit $vcc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr56, $vgpr47, $vgpr18, $vgpr30, $vgpr31, $vgpr58, $vgpr61, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr57, $vgpr63, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr46, $vgpr45, $vgpr2, $vgpr3, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr62 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -586,47 +586,41 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.46.Flow26: - ; GFX90A-NEXT: successors: %bb.48(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.48 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.47.bb48: - ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.48(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr44_sgpr45 + ; GFX90A-NEXT: successors: %bb.47(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: S_BRANCH %bb.47 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.46.bb48: + ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -640,71 +634,57 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.48.Flow25: + ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr16_sgpr17, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr54_sgpr55, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.49.bb63: - ; GFX90A-NEXT: successors: %bb.51(0x40000000), %bb.50(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.48.bb63: + ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.51, implicit $vcc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.50: - ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.49: + ; GFX90A-NEXT: successors: %bb.44(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.45 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 + ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.51.bb68: - ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.52(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.50.bb68: + ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr46_sgpr47, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.55, implicit $vcc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.52: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: bb.51: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -716,26 +696,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.46 + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.53.bb80: - ; GFX90A-NEXT: successors: %bb.60(0x40000000), %bb.54(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.52.bb80: + ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr15 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc - ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr15, 0, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc + ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr48_sgpr49 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.60, implicit killed $scc + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.54: - ; GFX90A-NEXT: successors: %bb.62(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.53: + ; GFX90A-NEXT: successors: %bb.61(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF @@ -745,21 +725,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: S_BRANCH %bb.62 + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.55.bb73: - ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.56(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51 + ; GFX90A-NEXT: bb.54.bb73: + ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr56_sgpr57 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -770,54 +750,54 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec + ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.56.Flow29: - ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.55.Flow29: + ; GFX90A-NEXT: successors: %bb.45(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc - ; GFX90A-NEXT: S_BRANCH %bb.46 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc + ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.57.bb90: - ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.56.bb90: + ; GFX90A-NEXT: successors: %bb.60(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr54, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr55, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr56, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr10, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr8_sgpr9, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec - ; GFX90A-NEXT: S_BRANCH %bb.61 + ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.58: + ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $exec, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -839,9 +819,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.59.bb105: + ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) @@ -849,26 +829,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr23 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr15 = S_MOV_B32 0 + ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.60.bb85: - ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.61(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.59.bb85: + ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) - ; GFX90A-NEXT: renamable $sgpr15 = S_MOV_B32 0 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -877,69 +857,69 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10 - ; GFX90A-NEXT: $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec + ; GFX90A-NEXT: $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.61.Flow31: - ; GFX90A-NEXT: successors: %bb.62(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.60.Flow31: + ; GFX90A-NEXT: successors: %bb.61(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.62.Flow30: - ; GFX90A-NEXT: successors: %bb.56(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.61.Flow30: + ; GFX90A-NEXT: successors: %bb.55(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr54_sgpr55, implicit-def dead $scc - ; GFX90A-NEXT: S_BRANCH %bb.56 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc + ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.63.bb140: - ; GFX90A-NEXT: successors: %bb.69(0x40000000), %bb.64(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.62.bb140: + ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr26_sgpr27, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.69, implicit $vcc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.68, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.64.Flow13: - ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.67(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.63.Flow13: + ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.67, implicit $vcc + ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.65.bb159: - ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.64.bb159: + ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.68, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.66.Flow10: - ; GFX90A-NEXT: successors: %bb.67(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.65.Flow10: + ; GFX90A-NEXT: successors: %bb.66(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_ANDN2_SAVEEXEC_B64 $sgpr8_sgpr9, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.67.Flow14: + ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.68.bb161: - ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.67.bb161: + ; GFX90A-NEXT: successors: %bb.65(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec @@ -954,38 +934,38 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) - ; GFX90A-NEXT: S_BRANCH %bb.66 + ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.69.bb174: - ; GFX90A-NEXT: successors: %bb.73(0x40000000), %bb.70(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.68.bb174: + ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr8_sgpr9, implicit $exec + ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr10, implicit $exec ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr12, implicit $exec - ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr8_sgpr9, implicit $exec - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.73, implicit $vcc + ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.70.Flow: - ; GFX90A-NEXT: successors: %bb.71(0x40000000), %bb.72(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.69.Flow: + ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc - ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc + ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.71.bb186: - ; GFX90A-NEXT: successors: %bb.72(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.70.bb186: + ; GFX90A-NEXT: successors: %bb.71(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr19, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr18, $vgpr2, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec @@ -1008,23 +988,23 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.72.Flow9: - ; GFX90A-NEXT: successors: %bb.64(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.71.Flow9: + ; GFX90A-NEXT: successors: %bb.63(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.64 + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 + ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.73.bb196: - ; GFX90A-NEXT: successors: %bb.70(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: bb.72.bb196: + ; GFX90A-NEXT: successors: %bb.69(0x80000000) + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_MOV_B64 0 - ; GFX90A-NEXT: S_BRANCH %bb.70 + ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 + ; GFX90A-NEXT: S_BRANCH %bb.69 bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() %i11 = icmp eq i32 %i, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index 0047b6b0ee9348..dd9c9a3699b4ff 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,10 +4,10 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s44, s[6:7], 0x2 +; CHECK-NEXT: s_load_dword s44, s[8:9], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] -; CHECK-NEXT: s_add_u32 s96, s96, s13 +; CHECK-NEXT: s_add_u32 s96, s96, s15 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s44, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 77f1bc2a172a50..1d984bd49756e0 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -24,7 +24,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 @@ -36,7 +36,7 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -47,18 +47,18 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; ; GFX11-LABEL: uniform_conditional_max_short_forward_branch: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX11-NEXT: ; %bb.3: ; %bb -; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: .Lpost_getpc0: ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_add_u32 s4, s4, (.LBB0_2-.Lpost_getpc0)&4294967295 -; GFX11-NEXT: s_addc_u32 s5, s5, (.LBB0_2-.Lpost_getpc0)>>32 +; GFX11-NEXT: s_add_u32 s2, s2, (.LBB0_2-.Lpost_getpc0)&4294967295 +; GFX11-NEXT: s_addc_u32 s3, s3, (.LBB0_2-.Lpost_getpc0)>>32 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_setpc_b64 s[4:5] +; GFX11-NEXT: s_setpc_b64 s[2:3] ; GFX11-NEXT: .LBB0_1: ; %bb2 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: v_nop_e64 @@ -67,7 +67,7 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_sleep 0 ; GFX11-NEXT: .LBB0_2: ; %bb3 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -77,18 +77,18 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; ; GFX12-LABEL: uniform_conditional_max_short_forward_branch: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s0, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX12-NEXT: ; %bb.3: ; %bb -; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_getpc_b64 s[2:3] ; GFX12-NEXT: .Lpost_getpc0: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s4, s4, (.LBB0_2-.Lpost_getpc0)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, (.LBB0_2-.Lpost_getpc0)>>32 +; GFX12-NEXT: s_add_co_u32 s2, s2, (.LBB0_2-.Lpost_getpc0)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, (.LBB0_2-.Lpost_getpc0)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_setpc_b64 s[4:5] +; GFX12-NEXT: s_setpc_b64 s[2:3] ; GFX12-NEXT: .LBB0_1: ; %bb2 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: v_nop_e64 @@ -97,7 +97,7 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_sleep 0 ; GFX12-NEXT: .LBB0_2: ; %bb3 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS @@ -124,16 +124,16 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc0: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB1_2-.Lpost_getpc0)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB1_2-.Lpost_getpc0)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB1_2-.Lpost_getpc0)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB1_2-.Lpost_getpc0)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB1_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 @@ -142,7 +142,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -153,18 +153,18 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; ; GFX11-LABEL: uniform_conditional_min_long_forward_branch: ; GFX11: ; %bb.0: ; %bb0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX11-NEXT: ; %bb.3: ; %bb0 -; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: .Lpost_getpc1: ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_add_u32 s4, s4, (.LBB1_2-.Lpost_getpc1)&4294967295 -; GFX11-NEXT: s_addc_u32 s5, s5, (.LBB1_2-.Lpost_getpc1)>>32 +; GFX11-NEXT: s_add_u32 s2, s2, (.LBB1_2-.Lpost_getpc1)&4294967295 +; GFX11-NEXT: s_addc_u32 s3, s3, (.LBB1_2-.Lpost_getpc1)>>32 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_setpc_b64 s[4:5] +; GFX11-NEXT: s_setpc_b64 s[2:3] ; GFX11-NEXT: .LBB1_1: ; %bb2 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: v_nop_e64 @@ -173,7 +173,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GFX11-NEXT: v_nop_e64 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: .LBB1_2: ; %bb3 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -183,18 +183,18 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; ; GFX12-LABEL: uniform_conditional_min_long_forward_branch: ; GFX12: ; %bb.0: ; %bb0 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s0, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX12-NEXT: ; %bb.3: ; %bb0 -; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_getpc_b64 s[2:3] ; GFX12-NEXT: .Lpost_getpc1: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s4, s4, (.LBB1_2-.Lpost_getpc1)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, (.LBB1_2-.Lpost_getpc1)>>32 +; GFX12-NEXT: s_add_co_u32 s2, s2, (.LBB1_2-.Lpost_getpc1)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, (.LBB1_2-.Lpost_getpc1)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_setpc_b64 s[4:5] +; GFX12-NEXT: s_setpc_b64 s[2:3] ; GFX12-NEXT: .LBB1_1: ; %bb2 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: v_nop_e64 @@ -203,7 +203,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GFX12-NEXT: v_nop_e64 ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: .LBB1_2: ; %bb3 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS @@ -230,17 +230,17 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 -; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: v_cmp_eq_f32_e64 s[2:3], s0, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc1: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB2_2-.Lpost_getpc1)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB2_2-.Lpost_getpc1)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB2_2-.Lpost_getpc1)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB2_2-.Lpost_getpc1)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB2_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -250,7 +250,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -261,20 +261,20 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; ; GFX11-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GFX11: ; %bb.0: ; %bb0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 +; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX11-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX11-NEXT: s_cbranch_vccz .LBB2_1 ; GFX11-NEXT: ; %bb.3: ; %bb0 -; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: .Lpost_getpc2: ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_add_u32 s4, s4, (.LBB2_2-.Lpost_getpc2)&4294967295 -; GFX11-NEXT: s_addc_u32 s5, s5, (.LBB2_2-.Lpost_getpc2)>>32 +; GFX11-NEXT: s_add_u32 s2, s2, (.LBB2_2-.Lpost_getpc2)&4294967295 +; GFX11-NEXT: s_addc_u32 s3, s3, (.LBB2_2-.Lpost_getpc2)>>32 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_setpc_b64 s[4:5] +; GFX11-NEXT: s_setpc_b64 s[2:3] ; GFX11-NEXT: .LBB2_1: ; %bb2 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; 32 bytes @@ -284,7 +284,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GFX11-NEXT: v_nop_e64 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: .LBB2_2: ; %bb3 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -294,7 +294,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; ; GFX12-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GFX12: ; %bb.0: ; %bb0 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_f32 s0, 0 ; GFX12-NEXT: s_cselect_b32 s1, -1, 0 @@ -302,13 +302,13 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GFX12-NEXT: s_and_b32 vcc_lo, exec_lo, s1 ; GFX12-NEXT: s_cbranch_vccz .LBB2_1 ; GFX12-NEXT: ; %bb.3: ; %bb0 -; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_getpc_b64 s[2:3] ; GFX12-NEXT: .Lpost_getpc2: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s4, s4, (.LBB2_2-.Lpost_getpc2)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, (.LBB2_2-.Lpost_getpc2)>>32 +; GFX12-NEXT: s_add_co_u32 s2, s2, (.LBB2_2-.Lpost_getpc2)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, (.LBB2_2-.Lpost_getpc2)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_setpc_b64 s[4:5] +; GFX12-NEXT: s_setpc_b64 s[2:3] ; GFX12-NEXT: .LBB2_1: ; %bb2 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; 32 bytes @@ -318,7 +318,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GFX12-NEXT: v_nop_e64 ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: .LBB2_2: ; %bb3 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS @@ -344,7 +344,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -382,7 +382,7 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; ; GFX11-LABEL: min_long_forward_vbranch: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -418,7 +418,7 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; ; GFX12-LABEL: min_long_forward_vbranch: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -573,7 +573,7 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_mov_b64 s[0:1], -1 @@ -588,13 +588,13 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -626,7 +626,7 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; ; GFX11-LABEL: uniform_unconditional_min_long_forward_branch: ; GFX11: ; %bb.0: ; %bb0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], -1 @@ -639,7 +639,7 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB5_3: ; %bb4 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 63 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -673,7 +673,7 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; ; GFX12-LABEL: uniform_unconditional_min_long_forward_branch: ; GFX12: ; %bb.0: ; %bb0 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s0, 0 ; GFX12-NEXT: s_mov_b32 s0, -1 @@ -694,7 +694,7 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: .LBB5_3: ; %bb4 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -837,7 +837,7 @@ loop: define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; GCN-LABEL: expand_requires_expand: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lt_i32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -875,7 +875,7 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; ; GFX11-LABEL: expand_requires_expand: ; GFX11: ; %bb.0: ; %bb0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lt_i32 s0, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -917,7 +917,7 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; ; GFX12-LABEL: expand_requires_expand: ; GFX12: ; %bb.0: ; %bb0 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lt_i32 s0, 0 ; GFX12-NEXT: s_cselect_b32 s0, -1, 0 @@ -990,7 +990,7 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-LABEL: uniform_inside_divergent: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execnz .LBB8_1 ; GCN-NEXT: ; %bb.4: ; %entry ; GCN-NEXT: s_getpc_b64 s[0:1] @@ -999,13 +999,13 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB8_1: ; %if -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_cbranch_scc1 .LBB8_3 ; GCN-NEXT: ; %bb.2: ; %if_uniform @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: .LBB8_3: ; %endif -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm ; @@ -1026,11 +1026,11 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GFX11-NEXT: s_cbranch_execz .LBB8_3 ; GFX11-NEXT: ; %bb.1: ; %if ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s6, 0 ; GFX11-NEXT: global_store_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_cbranch_scc1 .LBB8_3 ; GFX11-NEXT: ; %bb.2: ; %if_uniform @@ -1044,12 +1044,12 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GFX12-LABEL: uniform_inside_divergent: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_gt_u32_e32 16, v0 ; GFX12-NEXT: s_cbranch_execz .LBB8_3 ; GFX12-NEXT: ; %bb.1: ; %if -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u32 s2, 0 @@ -1059,7 +1059,7 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: v_mov_b32_e32 v1, 1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: .LBB8_3: ; %endif -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12-NEXT: s_sleep 5 ; GFX12-NEXT: s_endpgm entry: @@ -1267,13 +1267,13 @@ ret: define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 { ; GCN-LABEL: long_branch_hang: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_cmp_lt_i32 s7, 6 +; GCN-NEXT: s_cmp_lt_i32 s3, 6 ; GCN-NEXT: s_cbranch_scc1 .LBB10_1 ; GCN-NEXT: ; %bb.8: ; %bb ; GCN-NEXT: s_getpc_b64 s[8:9] @@ -1293,9 +1293,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: .LBB10_2: ; GCN-NEXT: s_mov_b64 s[8:9], 0 ; GCN-NEXT: .LBB10_3: ; %bb9 -; GCN-NEXT: s_cmp_lt_i32 s7, 11 +; GCN-NEXT: s_cmp_lt_i32 s3, 11 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_cmp_ge_i32 s6, s7 +; GCN-NEXT: s_cmp_ge_i32 s2, s3 ; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GCN-NEXT: s_and_b64 s[8:9], s[10:11], s[8:9] ; GCN-NEXT: .LBB10_4: ; %Flow5 @@ -1308,23 +1308,23 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB10_5: ; %bb14 -; GCN-NEXT: s_cmp_lt_i32 s5, 9 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_cmp_lt_i32 s6, s7 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_cmp_lt_i32 s1, 9 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_cmp_lt_i32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: s_branch .LBB10_7 ; GCN-NEXT: .LBB10_6: ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB10_7: ; %bb19 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 @@ -1332,13 +1332,13 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; ; GFX11-LABEL: long_branch_hang: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX11-NEXT: s_cmp_lt_i32 s7, 6 +; GFX11-NEXT: s_cmp_lt_i32 s3, 6 ; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-NEXT: ; %bb.8: ; %bb ; GFX11-NEXT: s_getpc_b64 s[8:9] @@ -1360,9 +1360,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX11-NEXT: .LBB10_2: ; GFX11-NEXT: s_mov_b64 s[8:9], 0 ; GFX11-NEXT: .LBB10_3: ; %bb9 -; GFX11-NEXT: s_cmp_lt_i32 s7, 11 +; GFX11-NEXT: s_cmp_lt_i32 s3, 11 ; GFX11-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX11-NEXT: s_cmp_ge_i32 s6, s7 +; GFX11-NEXT: s_cmp_ge_i32 s2, s3 ; GFX11-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], s[8:9] @@ -1371,13 +1371,13 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX11-NEXT: s_and_not1_b64 vcc, exec, s[8:9] ; GFX11-NEXT: s_cbranch_vccnz .LBB10_6 ; GFX11-NEXT: ; %bb.5: ; %bb14 -; GFX11-NEXT: s_cmp_lt_i32 s5, 9 -; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX11-NEXT: s_cmp_lt_i32 s6, s7 -; GFX11-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX11-NEXT: s_cmp_lt_i32 s1, 9 +; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX11-NEXT: s_cmp_lt_i32 s2, s3 +; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX11-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX11-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX11-NEXT: s_branch .LBB10_7 @@ -1385,8 +1385,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB10_7: ; %bb19 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 @@ -1398,15 +1398,15 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; ; GFX12-LABEL: long_branch_hang: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_mov_b32 s7, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s4, 0 -; GFX12-NEXT: s_cselect_b32 s0, -1, 0 -; GFX12-NEXT: s_cmp_lg_u32 s4, 0 -; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_cselect_b32 s8, -1, 0 -; GFX12-NEXT: s_cmp_lt_i32 s7, 6 +; GFX12-NEXT: s_cmp_lt_i32 s3, 6 ; GFX12-NEXT: s_cbranch_scc0 .LBB10_1 ; GFX12-NEXT: ; %bb.18: ; %bb ; GFX12-NEXT: s_getpc_b64 s[10:11] @@ -1417,7 +1417,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[10:11] ; GFX12-NEXT: .LBB10_1: ; %Flow -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 ; GFX12-NEXT: s_cbranch_vccnz .LBB10_2 ; GFX12-NEXT: ; %bb.10: ; %Flow ; GFX12-NEXT: s_getpc_b64 s[8:9] @@ -1428,7 +1428,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[8:9] ; GFX12-NEXT: .LBB10_2: ; %Flow5 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_vccz .LBB10_3 ; GFX12-NEXT: ; %bb.12: ; %Flow5 ; GFX12-NEXT: s_getpc_b64 s[0:1] @@ -1439,13 +1439,13 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[0:1] ; GFX12-NEXT: .LBB10_3: ; %bb14 -; GFX12-NEXT: s_cmp_lt_i32 s5, 9 +; GFX12-NEXT: s_cmp_lt_i32 s1, 9 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_cmp_lt_i32 s2, s3 ; GFX12-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-NEXT: s_cmp_lt_i32 s6, s7 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_or_b32 s1, s4, s1 -; GFX12-NEXT: s_and_b32 s0, s0, s1 +; GFX12-NEXT: s_or_b32 s0, s1, s0 +; GFX12-NEXT: s_and_b32 s0, s6, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX12-NEXT: ; %bb.8: ; %bb14 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[0:1] ; GFX12-NEXT: .LBB10_4: ; %bb13 -; GFX12-NEXT: s_mov_b32 s1, s8 +; GFX12-NEXT: s_mov_b32 s0, s8 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: v_nop_e64 ; GFX12-NEXT: v_nop_e64 @@ -1474,13 +1474,13 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[8:9] ; GFX12-NEXT: .LBB10_5: ; %bb9 -; GFX12-NEXT: s_cmp_lt_i32 s7, 11 -; GFX12-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-NEXT: s_cmp_ge_i32 s6, s7 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_cmp_lt_i32 s3, 11 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_cmp_ge_i32 s2, s3 +; GFX12-NEXT: s_cselect_b32 s7, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s1, s4, s1 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX12-NEXT: s_and_b32 s0, s7, s0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_vccnz .LBB10_6 ; GFX12-NEXT: ; %bb.16: ; %bb9 ; GFX12-NEXT: s_getpc_b64 s[8:9] @@ -1494,8 +1494,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB10_7: ; %bb19 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x3c +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 3c48a0f0dcabe3..30c8e94c9a27f5 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -19,7 +19,7 @@ declare i48 @llvm.bswap.i48(i48) #1 define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -34,7 +34,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -85,7 +85,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -101,7 +101,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -120,7 +120,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -162,7 +162,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -183,7 +183,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -220,7 +220,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -243,7 +243,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v8i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 @@ -270,7 +270,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -288,7 +288,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -323,7 +323,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -347,7 +347,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -386,7 +386,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -423,7 +423,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -446,7 +446,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 6486117e014d4e..e8f1619c5d418c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -21,7 +21,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -31,7 +31,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -41,7 +41,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -53,15 +53,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -70,7 +66,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -85,12 +81,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -99,13 +91,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start @@ -115,7 +103,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -130,13 +118,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start @@ -146,7 +130,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -161,13 +145,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start @@ -177,7 +157,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -192,13 +172,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start @@ -209,7 +185,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -233,7 +209,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -243,7 +219,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -253,7 +229,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -264,15 +240,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -280,7 +252,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -296,12 +268,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -309,12 +277,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -322,13 +286,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start @@ -337,7 +297,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -352,13 +312,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start @@ -367,7 +323,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -382,13 +338,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start @@ -398,7 +350,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -842,7 +794,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -852,7 +804,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -862,7 +814,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -874,15 +826,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -891,7 +839,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -907,13 +855,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start @@ -922,7 +866,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -937,13 +881,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start @@ -953,7 +893,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -968,13 +908,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start @@ -984,7 +920,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -999,13 +935,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start @@ -1015,7 +947,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1030,13 +962,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start @@ -1047,7 +975,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1071,7 +999,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1081,7 +1009,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1091,7 +1019,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1102,15 +1030,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1118,7 +1042,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1134,13 +1058,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start @@ -1148,7 +1068,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -1163,13 +1083,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start @@ -1178,7 +1094,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -1193,13 +1109,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start @@ -1208,7 +1120,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -1223,13 +1135,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start @@ -1238,7 +1146,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -1253,13 +1161,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start @@ -1269,7 +1173,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -1294,7 +1198,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1304,7 +1208,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1314,10 +1218,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1345,15 +1249,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1362,7 +1262,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1378,13 +1278,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1393,7 +1289,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1408,13 +1304,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1424,7 +1316,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1439,13 +1331,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1455,7 +1343,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1470,13 +1358,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1486,7 +1370,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1501,13 +1385,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1518,7 +1398,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1542,7 +1422,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1552,7 +1432,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1562,10 +1442,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1593,15 +1473,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1610,7 +1486,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1626,13 +1502,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1641,7 +1513,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1656,13 +1528,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1672,7 +1540,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1687,13 +1555,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1703,7 +1567,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1718,13 +1582,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1734,7 +1594,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1749,13 +1609,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1766,7 +1622,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1790,7 +1646,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1800,7 +1656,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1810,10 +1666,10 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start @@ -1841,15 +1697,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1858,7 +1710,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1874,13 +1726,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start @@ -1889,7 +1737,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1904,13 +1752,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start @@ -1920,7 +1764,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1935,13 +1779,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start @@ -1951,7 +1791,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1966,13 +1806,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start @@ -1982,7 +1818,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1997,13 +1833,9 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start @@ -2014,7 +1846,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -2043,8 +1875,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -2075,7 +1907,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2086,8 +1918,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -2118,15 +1950,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2139,7 +1967,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2154,12 +1982,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2167,15 +1991,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2188,7 +2008,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2202,15 +2022,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2223,7 +2039,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2237,15 +2053,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2258,7 +2070,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: v_mov_b32_e32 v2, v9 ; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2272,15 +2084,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2294,7 +2102,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_mov_b32_e32 v1, v8 ; GFX6-NEXT: v_mov_b32_e32 v2, v9 ; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2318,8 +2126,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -2350,7 +2158,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2360,8 +2168,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -2391,15 +2199,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 -; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2409,7 +2213,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2426,12 +2230,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2439,13 +2239,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2456,7 +2252,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v8, v3 ; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] @@ -2472,13 +2268,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2489,7 +2281,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v9, v4 ; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] @@ -2505,13 +2297,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2522,7 +2310,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_mov_b32_e32 v9, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] @@ -2538,13 +2326,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2556,7 +2340,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_mov_b32_e32 v9, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] @@ -3118,8 +2902,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -3150,7 +2934,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3161,8 +2945,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -3193,15 +2977,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3214,7 +2994,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3229,15 +3009,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x800 +; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3247,7 +3023,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -3261,15 +3037,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3282,7 +3054,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3296,15 +3068,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3317,7 +3085,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3331,15 +3099,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3352,7 +3116,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: v_mov_b32_e32 v2, v9 ; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3366,15 +3130,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3388,7 +3148,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: v_mov_b32_e32 v1, v8 ; GFX6-NEXT: v_mov_b32_e32 v2, v9 ; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3413,8 +3173,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -3445,7 +3205,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -3456,8 +3216,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -3488,15 +3248,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3509,7 +3265,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3524,12 +3280,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3537,15 +3289,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start @@ -3558,7 +3306,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3572,15 +3320,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start @@ -3593,7 +3337,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3607,15 +3351,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start @@ -3628,7 +3368,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: v_mov_b32_e32 v2, v9 ; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3642,15 +3382,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start @@ -3664,7 +3400,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_mov_b32_e32 v1, v8 ; GFX6-NEXT: v_mov_b32_e32 v2, v9 ; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -3692,12 +3428,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3738,11 +3474,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -3772,11 +3508,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -3814,17 +3550,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: s_and_b32 s4, s20, -4 ; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start @@ -3837,7 +3569,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3854,15 +3586,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 @@ -3875,7 +3603,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3891,15 +3619,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v5, s4 -; GFX908-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 @@ -3913,7 +3637,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -3929,15 +3653,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -3952,7 +3672,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -3968,16 +3688,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 @@ -3995,7 +3711,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4012,16 +3728,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 @@ -4040,7 +3752,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4067,12 +3779,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4112,11 +3824,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -4145,11 +3857,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -4186,17 +3898,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 +; GFX10-NEXT: s_addk_i32 s20, 0x200 +; GFX10-NEXT: s_and_b32 s4, s20, -4 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4209,7 +3917,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -4225,15 +3933,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 @@ -4246,7 +3950,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 ; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -4261,15 +3965,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v3, s4 -; GFX908-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 @@ -4283,7 +3983,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -4298,15 +3998,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -4321,7 +4017,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -4336,16 +4032,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 @@ -4363,7 +4055,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4378,16 +4070,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 @@ -4406,7 +4094,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5079,13 +4767,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5134,11 +4822,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -5176,12 +4864,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -5228,18 +4916,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start @@ -5257,7 +4941,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -5274,21 +4958,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5296,13 +4976,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5318,21 +4998,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -5340,14 +5016,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5363,15 +5039,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -5394,7 +5066,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5410,15 +5082,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5437,7 +5105,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5454,15 +5122,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5482,7 +5146,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5509,13 +5173,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5563,11 +5227,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -5604,12 +5268,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -5655,18 +5319,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start @@ -5684,7 +5344,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -5700,21 +5360,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5722,13 +5378,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5743,21 +5399,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -5765,14 +5417,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5787,15 +5439,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -5818,7 +5466,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5833,15 +5481,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5860,7 +5504,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5875,15 +5519,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5903,7 +5543,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6633,7 +6273,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6643,7 +6283,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6653,10 +6293,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start @@ -6684,15 +6324,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6701,7 +6337,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6716,12 +6352,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6730,13 +6362,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start @@ -6746,7 +6374,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -6761,13 +6389,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start @@ -6779,7 +6403,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -6793,15 +6417,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6826,7 +6446,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -6843,15 +6463,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6877,7 +6493,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -6904,7 +6520,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -6914,7 +6530,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -6924,8 +6540,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -6955,15 +6571,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6971,7 +6583,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6987,12 +6599,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7000,12 +6608,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -7013,13 +6617,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start @@ -7030,7 +6630,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -7045,15 +6645,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7078,7 +6674,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -7095,15 +6691,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7129,7 +6721,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -7656,7 +7248,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -7666,7 +7258,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -7676,10 +7268,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start @@ -7707,15 +7299,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7724,7 +7312,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7740,13 +7328,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start @@ -7755,7 +7339,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -7770,13 +7354,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start @@ -7786,7 +7366,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -7801,13 +7381,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start @@ -7819,7 +7395,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -7833,15 +7409,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7866,7 +7438,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -7883,15 +7455,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7917,7 +7485,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -7944,7 +7512,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -7954,7 +7522,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -7964,8 +7532,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -7995,15 +7563,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8011,7 +7575,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8027,13 +7591,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start @@ -8041,7 +7601,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8056,13 +7616,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start @@ -8071,7 +7627,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8086,13 +7642,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start @@ -8103,7 +7655,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8118,15 +7670,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8151,7 +7699,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -8168,15 +7716,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8202,7 +7746,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -8229,7 +7773,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -8239,7 +7783,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -8249,10 +7793,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start @@ -8280,15 +7824,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8297,7 +7837,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8313,13 +7853,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start @@ -8328,7 +7864,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8343,13 +7879,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start @@ -8359,7 +7891,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8374,13 +7906,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start @@ -8392,7 +7920,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8406,15 +7934,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8439,7 +7963,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -8456,15 +7980,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8490,7 +8010,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -8517,7 +8037,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -8527,7 +8047,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -8537,8 +8057,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -8568,15 +8088,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8584,7 +8100,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8600,13 +8116,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start @@ -8614,7 +8126,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8629,13 +8141,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start @@ -8644,7 +8152,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8659,13 +8167,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start @@ -8676,7 +8180,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8691,15 +8195,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8724,7 +8224,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -8741,15 +8241,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8775,7 +8271,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -8806,7 +8302,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -8817,9 +8313,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -8861,8 +8357,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -8915,17 +8411,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8948,7 +8440,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8964,18 +8456,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8989,15 +8477,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -9012,18 +8500,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9037,16 +8521,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -9061,13 +8545,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -9096,7 +8576,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -9110,13 +8590,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -9140,7 +8616,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -9156,13 +8632,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -9187,7 +8659,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -9213,7 +8685,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -9223,9 +8695,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -9267,8 +8739,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 @@ -9316,16 +8788,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9348,7 +8816,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -9364,18 +8832,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9388,15 +8852,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -9411,18 +8875,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9435,16 +8895,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -9459,13 +8919,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 @@ -9493,7 +8949,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -9508,13 +8964,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -9538,7 +8990,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -9554,13 +9006,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -9585,7 +9033,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -10272,7 +9720,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -10283,9 +9731,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -10327,8 +9775,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -10381,17 +9829,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -10414,7 +9858,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -10430,18 +9874,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10455,15 +9895,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -10478,18 +9918,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10503,16 +9939,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -10527,13 +9963,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -10562,7 +9994,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -10576,13 +10008,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -10606,7 +10034,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -10622,13 +10050,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -10653,7 +10077,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -10679,7 +10103,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -10689,9 +10113,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -10733,8 +10157,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 @@ -10782,16 +10206,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10814,7 +10234,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -10830,18 +10250,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10854,15 +10270,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -10877,18 +10293,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10901,16 +10313,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -10925,13 +10337,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 @@ -10959,7 +10367,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -10974,13 +10382,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -11004,7 +10408,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -11020,13 +10424,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -11051,7 +10451,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -11077,7 +10477,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11088,9 +10488,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -11132,8 +10532,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -11186,17 +10586,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11219,7 +10615,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -11235,18 +10631,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11260,15 +10652,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -11283,18 +10675,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11308,16 +10696,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -11332,13 +10720,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -11367,7 +10751,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -11381,13 +10765,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -11411,7 +10791,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -11427,13 +10807,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -11458,7 +10834,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -11484,7 +10860,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -11494,9 +10870,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -11538,8 +10914,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 @@ -11587,16 +10963,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11619,7 +10991,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -11635,18 +11007,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11659,15 +11027,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -11682,18 +11050,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11706,16 +11070,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -11730,13 +11094,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 @@ -11764,7 +11124,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -11779,13 +11139,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -11809,7 +11165,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -11825,13 +11181,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -11856,7 +11208,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -11882,7 +11234,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -11892,9 +11244,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -11936,8 +11288,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 @@ -11985,16 +11337,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12017,7 +11365,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -12033,18 +11381,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12057,15 +11401,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -12080,18 +11424,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12104,16 +11444,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -12128,13 +11468,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 @@ -12162,7 +11498,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -12177,13 +11513,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -12207,7 +11539,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -12223,13 +11555,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -12254,7 +11582,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -12284,7 +11612,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -12295,7 +11623,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12305,7 +11633,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12317,15 +11645,11 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12334,7 +11658,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -12350,13 +11674,9 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start @@ -12366,7 +11686,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 @@ -12382,13 +11702,9 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start @@ -12398,7 +11714,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -12413,13 +11729,9 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start @@ -12429,7 +11741,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -12444,13 +11756,9 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start @@ -12460,7 +11768,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -12475,13 +11783,9 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start @@ -12492,7 +11796,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 3253fb08836537..c7511a2df9fe13 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -21,7 +21,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -32,9 +32,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -60,7 +60,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -71,13 +71,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -87,13 +83,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -104,7 +96,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -119,13 +111,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -137,7 +125,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -152,13 +140,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -170,7 +154,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -184,12 +168,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -197,12 +177,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -220,7 +196,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -230,9 +206,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -258,7 +234,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -269,13 +245,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -284,13 +256,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -300,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -315,13 +283,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -332,7 +296,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -347,13 +311,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -364,7 +324,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -379,12 +339,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -392,12 +348,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -809,7 +761,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -820,9 +772,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -848,8 +800,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 @@ -881,15 +833,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -900,7 +848,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -916,13 +864,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -933,7 +877,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -948,13 +892,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -966,7 +906,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -981,13 +921,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -999,7 +935,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1014,13 +950,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 @@ -1032,7 +964,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1047,13 +979,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 @@ -1066,7 +994,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1090,7 +1018,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1101,9 +1029,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -1129,7 +1057,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1140,13 +1068,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1156,13 +1080,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -1173,7 +1093,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1188,13 +1108,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -1206,7 +1122,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1221,13 +1137,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -1239,7 +1151,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1253,12 +1165,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1266,12 +1174,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -1294,8 +1198,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -1329,7 +1233,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1340,8 +1244,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] @@ -1374,13 +1278,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1389,12 +1289,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1402,16 +1298,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1425,7 +1317,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -1439,16 +1331,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1462,7 +1350,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -1476,12 +1364,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1489,12 +1373,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -1512,9 +1392,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 @@ -1546,7 +1426,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1556,9 +1436,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 @@ -1589,13 +1469,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1604,12 +1480,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1617,14 +1489,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1636,7 +1504,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v9, v2 ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] @@ -1652,14 +1520,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1671,7 +1535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] @@ -1687,12 +1551,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1700,12 +1560,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -2156,8 +2012,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -2191,7 +2047,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2202,8 +2058,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] @@ -2237,15 +2093,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2260,7 +2112,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2275,15 +2127,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x800 +; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 @@ -2295,7 +2143,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -2309,16 +2157,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2332,7 +2176,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2346,16 +2190,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2369,7 +2209,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2383,15 +2223,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 @@ -2406,7 +2242,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: v_mov_b32_e32 v2, v9 ; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2420,15 +2256,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 @@ -2444,7 +2276,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: v_mov_b32_e32 v1, v8 ; GFX6-NEXT: v_mov_b32_e32 v2, v9 ; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2469,8 +2301,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -2504,7 +2336,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2515,8 +2347,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] @@ -2549,13 +2381,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2564,12 +2392,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2577,16 +2401,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2600,7 +2420,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2614,16 +2434,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2637,7 +2453,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2651,12 +2467,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2664,12 +2476,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -2691,13 +2499,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2739,11 +2547,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -2775,12 +2583,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -2820,18 +2628,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2845,7 +2649,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2862,15 +2666,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 @@ -2885,7 +2685,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2901,15 +2701,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 @@ -2925,7 +2721,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2941,15 +2737,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -2966,7 +2758,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2982,16 +2774,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 @@ -3009,7 +2797,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3026,16 +2814,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 @@ -3054,7 +2838,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3081,13 +2865,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3128,11 +2912,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -3163,12 +2947,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -3207,18 +2991,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3232,7 +3012,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3248,15 +3028,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 @@ -3271,7 +3047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3286,15 +3062,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 @@ -3310,7 +3082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3325,15 +3097,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -3350,7 +3118,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3365,16 +3133,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 @@ -3392,7 +3156,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3407,16 +3171,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 @@ -3435,7 +3195,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4124,13 +3884,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4179,11 +3939,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -4221,12 +3981,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -4273,18 +4033,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start @@ -4302,7 +4058,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -4319,21 +4075,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4341,13 +4093,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4363,21 +4115,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4385,14 +4133,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4408,15 +4156,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -4439,7 +4183,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4455,15 +4199,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4483,7 +4223,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4500,15 +4240,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4529,7 +4265,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4556,13 +4292,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4610,11 +4346,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -4651,12 +4387,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -4702,18 +4438,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4731,7 +4463,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -4747,21 +4479,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4769,13 +4497,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4790,21 +4518,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4812,14 +4536,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4834,15 +4558,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -4865,7 +4585,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4880,15 +4600,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4908,7 +4624,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4923,15 +4639,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4952,7 +4664,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5684,8 +5396,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5720,9 +5432,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -5749,8 +5461,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 @@ -5783,15 +5495,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5802,7 +5510,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -5818,13 +5526,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -5835,7 +5539,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -5850,13 +5554,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -5868,7 +5568,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -5883,13 +5583,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 @@ -5905,7 +5601,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -5919,15 +5615,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5952,7 +5644,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -5969,15 +5661,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6003,7 +5691,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -6030,8 +5718,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 @@ -6063,9 +5751,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -6092,8 +5780,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -6123,16 +5811,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6141,7 +5825,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6157,13 +5841,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -6173,7 +5853,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6188,13 +5868,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -6205,7 +5881,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6220,13 +5896,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 @@ -6241,7 +5913,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -6256,15 +5928,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6289,7 +5957,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -6306,15 +5974,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6340,7 +6004,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -6988,8 +6652,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 @@ -7040,9 +6704,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -7084,8 +6748,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -7138,17 +6802,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7171,7 +6831,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7187,18 +6847,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7212,15 +6868,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -7235,18 +6891,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7260,16 +6912,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -7284,13 +6936,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -7319,7 +6967,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -7333,13 +6981,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -7363,7 +7007,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7379,13 +7023,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -7410,7 +7050,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7436,8 +7076,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 @@ -7483,9 +7123,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -7527,8 +7167,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 @@ -7576,16 +7216,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7608,7 +7244,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7624,18 +7260,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7648,15 +7280,15 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -7671,18 +7303,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7695,16 +7323,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7719,13 +7347,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 @@ -7753,7 +7377,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7768,13 +7392,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -7798,7 +7418,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -7814,13 +7434,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -7845,7 +7461,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -8593,7 +8209,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -8605,9 +8221,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -8633,7 +8249,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -8644,13 +8260,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8660,13 +8272,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -8678,7 +8286,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 @@ -8694,13 +8302,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -8712,7 +8316,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8727,13 +8331,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -8745,7 +8345,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8759,12 +8359,8 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8772,12 +8368,8 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 6ce2f350257c8e..0bcaacc6b08e8a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -21,7 +21,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -32,9 +32,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -60,7 +60,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -71,13 +71,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -87,13 +83,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -104,7 +96,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -119,13 +111,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -137,7 +125,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -152,13 +140,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -170,7 +154,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -184,12 +168,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -197,12 +177,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -220,7 +196,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -230,9 +206,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -258,7 +234,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -269,13 +245,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -284,13 +256,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -300,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -315,13 +283,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -332,7 +296,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -347,13 +311,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -364,7 +324,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -379,12 +339,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -392,12 +348,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -809,7 +761,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -820,9 +772,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -848,8 +800,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 @@ -881,15 +833,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -900,7 +848,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -916,13 +864,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -933,7 +877,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -948,13 +892,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -966,7 +906,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -981,13 +921,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -999,7 +935,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1014,13 +950,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX7-NEXT: v_mov_b32_e32 v3, s6 @@ -1032,7 +964,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1047,13 +979,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 @@ -1066,7 +994,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1090,7 +1018,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1101,9 +1029,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -1129,7 +1057,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1140,13 +1068,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1156,13 +1080,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -1173,7 +1093,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1188,13 +1108,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -1206,7 +1122,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1221,13 +1137,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -1239,7 +1151,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1253,12 +1165,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1266,12 +1174,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -1294,8 +1198,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -1329,7 +1233,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1340,8 +1244,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] @@ -1374,13 +1278,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1389,12 +1289,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1402,16 +1298,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1425,7 +1317,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -1439,16 +1331,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start @@ -1462,7 +1350,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -1476,12 +1364,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1489,12 +1373,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -1512,9 +1392,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 @@ -1546,7 +1426,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1556,9 +1436,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 @@ -1589,13 +1469,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1604,12 +1480,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1617,14 +1489,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1636,7 +1504,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v9, v2 ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] @@ -1652,14 +1520,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start @@ -1671,7 +1535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] @@ -1687,12 +1551,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1700,12 +1560,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -2156,8 +2012,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -2191,7 +2047,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2202,8 +2058,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] @@ -2237,15 +2093,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 +; GFX10-NEXT: s_add_i32 s4, s20, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2260,7 +2112,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2275,15 +2127,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x800 +; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v6, s6 @@ -2295,7 +2143,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -2309,16 +2157,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2332,7 +2176,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2346,16 +2190,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start @@ -2369,7 +2209,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2383,15 +2223,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 @@ -2406,7 +2242,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: v_mov_b32_e32 v2, v9 ; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2420,15 +2256,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mov_b32_e32 v6, s6 @@ -2444,7 +2276,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: v_mov_b32_e32 v1, v8 ; GFX6-NEXT: v_mov_b32_e32 v2, v9 ; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2469,8 +2301,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -2504,7 +2336,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s16 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2515,8 +2347,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] @@ -2549,13 +2381,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2564,12 +2392,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2577,16 +2401,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 +; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2600,7 +2420,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2614,16 +2434,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 +; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start @@ -2637,7 +2453,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2651,12 +2467,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2664,12 +2476,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -2691,13 +2499,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2739,11 +2547,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -2775,12 +2583,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -2820,18 +2628,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start @@ -2845,7 +2649,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2862,15 +2666,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 @@ -2885,7 +2685,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2901,15 +2701,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 @@ -2925,7 +2721,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2941,15 +2737,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -2966,7 +2758,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2982,16 +2774,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 @@ -3009,7 +2797,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3026,16 +2814,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 @@ -3054,7 +2838,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3081,13 +2865,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3128,11 +2912,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -3163,12 +2947,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -3207,18 +2991,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start @@ -3232,7 +3012,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -3248,15 +3028,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 @@ -3271,7 +3047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3286,15 +3062,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 @@ -3310,7 +3082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3325,15 +3097,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -3350,7 +3118,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3365,16 +3133,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 @@ -3392,7 +3156,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -3407,16 +3171,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 @@ -3435,7 +3195,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4124,13 +3884,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4179,11 +3939,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -4221,12 +3981,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -4273,18 +4033,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start @@ -4302,7 +4058,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -4319,21 +4075,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4341,13 +4093,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4363,21 +4115,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4385,14 +4133,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4408,15 +4156,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -4439,7 +4183,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4455,15 +4199,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4483,7 +4223,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4500,15 +4240,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4529,7 +4265,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4556,13 +4292,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s6, -4 +; GFX12-NEXT: s_and_b32 s4, s16, -4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s4, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4610,11 +4346,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 +; GFX940-NEXT: s_addk_i32 s16, 0x200 +; GFX940-NEXT: s_and_b32 s4, s16, -4 ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s16, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -4651,12 +4387,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s16, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 +; GFX11-NEXT: s_and_b32 s4, s16, -4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s4, s16, 3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 @@ -4702,18 +4438,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX10-NEXT: s_and_b32 s4, s20, -4 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s4, s20, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX10-NEXT: s_not_b32 s6, s5 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start @@ -4731,7 +4463,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -4747,21 +4479,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_addk_i32 s20, 0x200 +; GFX90A-NEXT: s_and_b32 s4, s20, -4 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s4, s20, 3 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX90A-NEXT: s_not_b32 s7, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4769,13 +4497,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4790,21 +4518,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 +; GFX908-NEXT: s_addk_i32 s20, 0x200 +; GFX908-NEXT: s_and_b32 s4, s20, -4 ; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 +; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s4, s20, 3 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX908-NEXT: s_not_b32 s7, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4812,14 +4536,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 +; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4834,15 +4558,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_addk_i32 s20, 0x200 +; GFX8-NEXT: s_and_b32 s4, s20, -4 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 +; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s4, s20, 3 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_not_b32 s7, s4 @@ -4865,7 +4585,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4880,15 +4600,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 +; GFX7-NEXT: s_addk_i32 s20, 0x200 +; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 +; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4908,7 +4624,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4923,15 +4639,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_addk_i32 s20, 0x200 +; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 +; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4952,7 +4664,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5684,8 +5396,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5720,9 +5432,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -5749,8 +5461,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 @@ -5783,15 +5495,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5802,7 +5510,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -5818,13 +5526,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -5835,7 +5539,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -5850,13 +5554,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -5868,7 +5568,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -5883,13 +5583,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 @@ -5905,7 +5601,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -5919,15 +5615,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5952,7 +5644,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -5969,15 +5661,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6003,7 +5691,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -6030,8 +5718,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v3, s4 @@ -6063,9 +5751,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -6092,8 +5780,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -6123,16 +5811,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6141,7 +5825,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6157,13 +5841,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -6173,7 +5853,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6188,13 +5868,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -6205,7 +5881,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6220,13 +5896,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 @@ -6241,7 +5913,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -6256,15 +5928,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6289,7 +5957,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -6306,15 +5974,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6340,7 +6004,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -6988,8 +6652,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v4, s4 @@ -7040,9 +6704,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -7084,8 +6748,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -7138,17 +6802,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7171,7 +6831,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7187,18 +6847,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7212,15 +6868,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 +; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -7235,18 +6891,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7260,16 +6912,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 +; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 +; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -7284,13 +6936,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -7319,7 +6967,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -7333,13 +6981,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -7363,7 +7007,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7379,13 +7023,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -7410,7 +7050,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7436,8 +7076,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 @@ -7483,9 +7123,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s16 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_add_i32 s4, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -7527,8 +7167,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 @@ -7576,16 +7216,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 +; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7608,7 +7244,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7624,18 +7260,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s12, 0x7fff +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7648,15 +7280,15 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -7671,18 +7303,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s12, 0x7fff +; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s13, 0x7060302 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: v_mov_b32_e32 v4, s4 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7695,16 +7323,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7719,13 +7347,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 @@ -7753,7 +7377,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7768,13 +7392,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -7798,7 +7418,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -7814,13 +7434,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 @@ -7845,7 +7461,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -8593,7 +8209,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -8605,9 +8221,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -8633,7 +8249,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -8644,13 +8260,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8660,13 +8272,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -8678,7 +8286,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 @@ -8694,13 +8302,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, s6 @@ -8712,7 +8316,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8727,13 +8331,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 @@ -8745,7 +8345,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8759,12 +8359,8 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -8772,12 +8368,8 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll index 08a997530d3c94..ce55e9171c8180 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; GISEL-LABEL: buffer_ptr_vector_ops: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; ; SDAG-LABEL: buffer_ptr_vector_ops: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -60,44 +60,44 @@ main_body: define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace(1) %dest) { ; GISEL-LABEL: buffer_structs: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_ashr_i32 s1, s0, 31 -; GISEL-NEXT: v_mov_b32_e32 v4, s0 -; GISEL-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 -; GISEL-NEXT: s_add_u32 s0, s8, s0 -; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: s_addc_u32 s1, s9, s1 -; GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-NEXT: v_mov_b32_e32 v3, s7 -; GISEL-NEXT: buffer_store_dword v4, v4, s[4:7], 0 offen -; GISEL-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dword v5, v4, s[0:1] offset:16 +; GISEL-NEXT: s_ashr_i32 s7, s6, 31 +; GISEL-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; GISEL-NEXT: s_add_u32 s4, s8, s4 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v4, s6 +; GISEL-NEXT: s_addc_u32 s5, s9, s5 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: buffer_store_dword v4, v4, s[0:3], 0 offen +; GISEL-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GISEL-NEXT: global_store_dword v5, v4, s[4:5] offset:16 ; GISEL-NEXT: s_endpgm ; ; SDAG-LABEL: buffer_structs: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_ashr_i32 s1, s0, 31 +; SDAG-NEXT: s_ashr_i32 s7, s6, 31 +; SDAG-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; SDAG-NEXT: s_add_u32 s4, s8, s4 +; SDAG-NEXT: v_mov_b32_e32 v0, s6 +; SDAG-NEXT: s_addc_u32 s5, s9, s5 +; SDAG-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; SDAG-NEXT: global_store_dword v4, v0, s[4:5] offset:16 ; SDAG-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 -; SDAG-NEXT: s_add_u32 s0, s8, s0 -; SDAG-NEXT: s_addc_u32 s1, s9, s1 -; SDAG-NEXT: buffer_store_dword v0, v0, s[4:7], 0 offen -; SDAG-NEXT: global_store_dword v4, v0, s[0:1] offset:16 -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: v_mov_b32_e32 v1, s5 -; SDAG-NEXT: v_mov_b32_e32 v2, s6 -; SDAG-NEXT: v_mov_b32_e32 v3, s7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-NEXT: v_mov_b32_e32 v3, s3 +; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm main_body: %buffer = extractvalue %fat_buffer_struct %arg, 0 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index dc9ce68a4a6836..4ab940288e8c86 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector2: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -19,7 +19,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector2: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -40,7 +40,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector2: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -50,7 +50,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector2: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -65,7 +65,7 @@ entry: define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector4: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -78,7 +78,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: v_mov_b32_e32 v2, 7 @@ -91,7 +91,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector4: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -103,7 +103,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector4: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -115,7 +115,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector4: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -132,7 +132,7 @@ entry: define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector_v2i16: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005 @@ -142,7 +142,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector_v2i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -161,7 +161,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -170,7 +170,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector_v2i16: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -184,21 +184,21 @@ entry: define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: build_vector_v2i16_trunc: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v0, 5, s4, 16 +; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: build_vector_v2i16_trunc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s2, s4, 16 +; GFX8-NEXT: s_lshr_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s2, 0x50000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -209,11 +209,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX10-LABEL: build_vector_v2i16_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -222,11 +222,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX11-LABEL: build_vector_v2i16_trunc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hl_b32_b16 s2, s4, 5 +; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -234,11 +234,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX940-LABEL: build_vector_v2i16_trunc: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s2, s4, 16 +; GFX940-NEXT: s_lshr_b32 s2, s2, 16 ; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 @@ -254,7 +254,7 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -269,7 +269,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -282,7 +282,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 @@ -294,7 +294,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 @@ -306,14 +306,14 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s0, s7, 16 -; GFX940-NEXT: s_lshl_b32 s1, s6, 16 -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 +; GFX940-NEXT: s_lshl_b32 s3, s3, 16 +; GFX940-NEXT: s_lshl_b32 s2, s2, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm entry: %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll index e8898d6a7001cc..ff8a490950a11e 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -12,7 +12,7 @@ ; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.numbered_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)) ; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4 -; ALL: .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr) +; ALL: .set kernel.num_vgpr, max(41, aliasee_default.num_vgpr) ; ALL-NEXT: .set kernel.num_agpr, max(0, aliasee_default.num_agpr) ; ALL-NEXT: .set kernel.numbered_sgpr, max(33, aliasee_default.numbered_sgpr) define amdgpu_kernel void @kernel() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll index a01268625cedbd..fdd37bb299807d 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll @@ -7,7 +7,7 @@ @alias0 = hidden alias void (), ptr @aliasee_default_vgpr64_sgpr102 ; CHECK-LABEL: {{^}}kernel0: -; CHECK: .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr) +; CHECK: .set kernel0.num_vgpr, max(41, aliasee_default_vgpr64_sgpr102.num_vgpr) ; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr) ; CHECK-NEXT: .set kernel0.numbered_sgpr, max(33, aliasee_default_vgpr64_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel0() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll index 86defe3ba7ec08..3b08960d164a69 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -12,7 +12,7 @@ ; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0) ; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.numbered_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)) -; CHECK: .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr) +; CHECK: .set kernel1.num_vgpr, max(42, aliasee_vgpr32_sgpr76.num_vgpr) ; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr) ; CHECK-NEXT: .set kernel1.numbered_sgpr, max(33, aliasee_vgpr32_sgpr76.numbered_sgpr) define amdgpu_kernel void @kernel1() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll index 6b1fbd9b6e16a2..b044e0a7167992 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -10,7 +10,7 @@ ; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0) ; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.numbered_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)) -; CHECK: .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr) +; CHECK: .set kernel2.num_vgpr, max(41, aliasee_vgpr64_sgpr102.num_vgpr) ; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr) ; CHECK-NEXT: .set kernel2.numbered_sgpr, max(33, aliasee_vgpr64_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel2() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll index c81181cd826677..264cc4bd190f97 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -10,7 +10,7 @@ ; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0) ; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.numbered_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)) -; CHECK: .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr) +; CHECK: .set kernel3.num_vgpr, max(41, aliasee_vgpr256_sgpr102.num_vgpr) ; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr) ; CHECK-NEXT: .set kernel3.numbered_sgpr, max(33, aliasee_vgpr256_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel3() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index d35b5fe818bef8..93a4469c7718ea 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -35,21 +35,20 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_i8_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i8_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i8_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -96,21 +95,20 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -157,21 +155,20 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -218,21 +215,21 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -279,21 +276,21 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -340,22 +337,22 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_v3i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s19, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s19, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -402,23 +399,23 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s20, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 +; GFX9-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-NEXT: v_writelane_b32 v40, s20, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s3, s17 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s3, s19 +; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -465,27 +462,27 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s22, s33 +; GFX9-NEXT: s_mov_b32 s24, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[26:27], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[24:25] -; GFX9-NEXT: v_writelane_b32 v40, s22, 2 +; GFX9-NEXT: s_mov_b64 exec, s[26:27] +; GFX9-NEXT: v_writelane_b32 v40, s24, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s3, s17 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 -; GFX9-NEXT: s_mov_b32 s16, s18 -; GFX9-NEXT: s_mov_b32 s17, s19 -; GFX9-NEXT: s_mov_b32 s18, s20 -; GFX9-NEXT: s_mov_b32 s19, s21 +; GFX9-NEXT: s_mov_b32 s3, s19 +; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: s_mov_b32 s16, s20 +; GFX9-NEXT: s_mov_b32 s17, s21 +; GFX9-NEXT: s_mov_b32 s18, s22 +; GFX9-NEXT: s_mov_b32 s19, s23 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[22:23] -; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] +; GFX9-NEXT: s_getpc_b64 s[24:25] +; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -500,21 +497,17 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX11-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s18, s33 +; GFX11-NEXT: s_mov_b32 s20, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s19, -1 +; GFX11-NEXT: s_or_saveexec_b32 s21, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s19 -; GFX11-NEXT: v_writelane_b32 v40, s18, 2 -; GFX11-NEXT: s_mov_b32 s19, s17 -; GFX11-NEXT: s_mov_b32 s18, s16 -; GFX11-NEXT: s_mov_b32 s17, s7 -; GFX11-NEXT: s_mov_b32 s16, s6 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 exec_lo, s21 +; GFX11-NEXT: v_writelane_b32 v40, s20, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[20:21] ; GFX11-NEXT: s_add_u32 s20, s20, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s21, s21, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -536,21 +529,20 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -597,21 +589,20 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_bf16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_bf16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -658,21 +649,20 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_f32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_f32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -719,21 +709,21 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_f64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_f64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -780,21 +770,20 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX9-LABEL: test_call_external_void_func_v2f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -842,21 +831,20 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2bf16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2bf16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -903,21 +891,21 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX9-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v3f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -964,21 +952,21 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; GFX9-LABEL: test_call_external_void_func_v4f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v4f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v4f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1025,21 +1013,21 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { ; GFX9-LABEL: test_call_external_void_func_p0_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_p0_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_p0_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1086,21 +1074,21 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) ; GFX9-LABEL: test_call_external_void_func_p1_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_p1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_p1_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1147,21 +1135,20 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) ; GFX9-LABEL: test_call_external_void_func_p3_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s17, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_p3_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_p3_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1208,23 +1195,23 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre ; GFX9-LABEL: test_call_external_void_func_v2p1_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s20, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s18, 2 +; GFX9-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-NEXT: v_writelane_b32 v40, s20, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s3, s17 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s3, s19 +; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p1_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1271,21 +1258,21 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre ; GFX9-LABEL: test_call_external_void_func_v2p5_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s18, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 +; GFX9-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, external_void_func_v2p5_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, external_void_func_v2p5_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1332,24 +1319,24 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre ; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s19, s33 +; GFX9-NEXT: s_mov_b32 s21, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-NEXT: v_writelane_b32 v40, s19, 2 +; GFX9-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-NEXT: v_writelane_b32 v40, s21, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s3, s17 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 -; GFX9-NEXT: s_mov_b32 s16, s18 +; GFX9-NEXT: s_mov_b32 s3, s19 +; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: s_mov_b32 s16, s20 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] +; GFX9-NEXT: s_getpc_b64 s[22:23] +; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1364,14 +1351,12 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre ; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s33 +; GFX11-NEXT: s_mov_b32 s17, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s17, -1 +; GFX11-NEXT: s_or_saveexec_b32 s18, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s17 -; GFX11-NEXT: v_writelane_b32 v40, s16, 2 -; GFX11-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; GFX11-NEXT: s_mov_b32 s16, s6 +; GFX11-NEXT: s_mov_b32 exec_lo, s18 +; GFX11-NEXT: v_writelane_b32 v40, s17, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[18:19] ; GFX11-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4 @@ -1398,32 +1383,32 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-LABEL: test_call_external_void_func_a15i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s27, s33 +; GFX9-NEXT: s_mov_b32 s29, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GFX9-NEXT: s_or_saveexec_b64 vcc, -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[28:29] -; GFX9-NEXT: v_writelane_b32 v40, s27, 2 +; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: v_writelane_b32 v40, s29, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s3, s17 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 -; GFX9-NEXT: s_mov_b32 s16, s18 -; GFX9-NEXT: s_mov_b32 s17, s19 -; GFX9-NEXT: s_mov_b32 s18, s20 -; GFX9-NEXT: s_mov_b32 s19, s21 -; GFX9-NEXT: s_mov_b32 s20, s22 -; GFX9-NEXT: s_mov_b32 s21, s23 -; GFX9-NEXT: s_mov_b32 s22, s24 -; GFX9-NEXT: s_mov_b32 s23, s25 -; GFX9-NEXT: s_mov_b32 s24, s26 +; GFX9-NEXT: s_mov_b32 s3, s19 +; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: s_mov_b32 s16, s20 +; GFX9-NEXT: s_mov_b32 s17, s21 +; GFX9-NEXT: s_mov_b32 s18, s22 +; GFX9-NEXT: s_mov_b32 s19, s23 +; GFX9-NEXT: s_mov_b32 s20, s24 +; GFX9-NEXT: s_mov_b32 s21, s25 +; GFX9-NEXT: s_mov_b32 s22, s26 +; GFX9-NEXT: s_mov_b32 s23, s27 +; GFX9-NEXT: s_mov_b32 s24, s28 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[28:29] -; GFX9-NEXT: s_add_u32 s28, s28, external_void_func_a15i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s29, s29, external_void_func_a15i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[28:29] +; GFX9-NEXT: s_getpc_b64 vcc +; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], vcc ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1438,29 +1423,20 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX11-LABEL: test_call_external_void_func_a15i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s23, s33 +; GFX11-NEXT: s_mov_b32 s25, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s24, -1 +; GFX11-NEXT: s_or_saveexec_b32 s26, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s24 -; GFX11-NEXT: v_writelane_b32 v40, s23, 2 -; GFX11-NEXT: s_mov_b32 s24, s22 -; GFX11-NEXT: s_mov_b32 s23, s21 -; GFX11-NEXT: s_mov_b32 s22, s20 -; GFX11-NEXT: s_mov_b32 s21, s19 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s20, s18 -; GFX11-NEXT: s_mov_b32 s19, s17 -; GFX11-NEXT: s_mov_b32 s18, s16 -; GFX11-NEXT: s_mov_b32 s17, s7 -; GFX11-NEXT: s_mov_b32 s16, s6 +; GFX11-NEXT: s_mov_b32 exec_lo, s26 +; GFX11-NEXT: v_writelane_b32 v40, s25, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[26:27] ; GFX11-NEXT: s_add_u32 s26, s26, external_void_func_a15i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s27, s27, external_void_func_a15i32_inreg@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[26:27] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 725c2d71ac5e35..26ab0f3ce63559 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -67,8 +67,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -85,8 +86,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -103,8 +105,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -118,27 +121,29 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_i1_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i1_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i1(i1 true) ret void @@ -147,16 +152,17 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; VI-LABEL: test_call_external_void_func_i1_signext: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s38, -1 -; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -169,16 +175,17 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; ; CI-LABEL: test_call_external_void_func_i1_signext: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s38, -1 -; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -191,16 +198,17 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; ; GFX9-LABEL: test_call_external_void_func_i1_signext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -215,33 +223,35 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext@rel32@hi+12 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i1_signext: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12 ; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) undef call void @external_void_func_i1_signext(i1 signext %var) @@ -252,16 +262,17 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; VI-LABEL: test_call_external_void_func_i1_zeroext: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s38, -1 -; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -274,16 +285,17 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; ; CI-LABEL: test_call_external_void_func_i1_zeroext: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s38, -1 -; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -296,16 +308,17 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; ; GFX9-LABEL: test_call_external_void_func_i1_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -320,33 +333,35 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i1_zeroext: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_zeroext@rel32@hi+12 ; HSA-NEXT: v_and_b32_e32 v0, 1, v0 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) undef call void @external_void_func_i1_zeroext(i1 zeroext %var) @@ -360,8 +375,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -378,8 +394,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -396,8 +413,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -411,27 +429,29 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX11-LABEL: test_call_external_void_func_i8_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i8_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i8(i8 123) ret void @@ -441,16 +461,17 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; VI-LABEL: test_call_external_void_func_i8_signext: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s38, -1 -; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -462,16 +483,17 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; ; CI-LABEL: test_call_external_void_func_i8_signext: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s38, -1 -; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -483,16 +505,17 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; ; GFX9-LABEL: test_call_external_void_func_i8_signext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -506,32 +529,34 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i8_signext: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_signext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef call void @external_void_func_i8_signext(i8 signext %var) @@ -541,16 +566,17 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; VI-LABEL: test_call_external_void_func_i8_zeroext: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s38, -1 -; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -562,16 +588,17 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; ; CI-LABEL: test_call_external_void_func_i8_zeroext: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s38, -1 -; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -583,16 +610,17 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; ; GFX9-LABEL: test_call_external_void_func_i8_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -606,32 +634,34 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i8_zeroext: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_zeroext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef call void @external_void_func_i8_zeroext(i8 zeroext %var) @@ -645,8 +675,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -663,8 +694,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -681,8 +713,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -696,27 +729,29 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_i16_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i16_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i16(i16 123) ret void @@ -725,16 +760,17 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; VI-LABEL: test_call_external_void_func_i16_signext: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s38, -1 -; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -746,16 +782,17 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; ; CI-LABEL: test_call_external_void_func_i16_signext: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s38, -1 -; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -767,16 +804,17 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; ; GFX9-LABEL: test_call_external_void_func_i16_signext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -790,32 +828,34 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i16_signext: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_signext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef call void @external_void_func_i16_signext(i16 signext %var) @@ -825,16 +865,17 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; VI-LABEL: test_call_external_void_func_i16_zeroext: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_mov_b32 s38, -1 -; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -846,16 +887,17 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; ; CI-LABEL: test_call_external_void_func_i16_zeroext: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_mov_b32 s38, -1 -; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -867,16 +909,17 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; ; GFX9-LABEL: test_call_external_void_func_i16_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -890,32 +933,34 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i16_zeroext: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; HSA-NEXT: s_waitcnt vmcnt(0) -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_zeroext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef call void @external_void_func_i16_zeroext(i16 zeroext %var) @@ -929,8 +974,9 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 42 @@ -947,8 +993,9 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 42 @@ -965,8 +1012,9 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 42 @@ -980,27 +1028,29 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX11-LABEL: test_call_external_void_func_i32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i32(i32 42) ret void @@ -1013,8 +1063,9 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -1032,8 +1083,9 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -1051,8 +1103,9 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -1067,28 +1120,30 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_i64_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_i64_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i64(i64 123) ret void @@ -1101,8 +1156,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s1, s0 @@ -1123,8 +1179,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b32 s0, 0 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s1, s0 @@ -1145,8 +1202,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s1, s0 @@ -1163,36 +1221,38 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; ; GFX11-LABEL: test_call_external_void_func_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2i64: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s4, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_mov_b32 s5, s4 -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) null call void @external_void_func_v2i64(<2 x i64> %val) @@ -1206,8 +1266,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1227,8 +1288,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -1248,8 +1310,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -1267,30 +1330,32 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2i64_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2i64(<2 x i64> ) ret void @@ -1303,8 +1368,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s1, s0 @@ -1327,8 +1393,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b32 s0, 0 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s1, s0 @@ -1351,8 +1418,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s1, s0 @@ -1371,39 +1439,41 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; ; GFX11-LABEL: test_call_external_void_func_v3i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 ; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3i64: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s4, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_mov_b32 s5, s4 -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v4, 1 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> @@ -1419,8 +1489,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s1, s0 @@ -1445,8 +1516,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b32 s0, 0 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s1, s0 @@ -1471,8 +1543,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s1, s0 @@ -1493,42 +1566,44 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; ; GFX11-LABEL: test_call_external_void_func_v4i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 ; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v4i64: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_mov_b32 s4, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_mov_b32 s5, s4 -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: v_mov_b32_e32 v6, 3 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v7, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> @@ -1543,8 +1618,9 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -1561,8 +1637,9 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -1579,8 +1656,9 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -1594,27 +1672,29 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_f16_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_f16_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_f16(half 4.0) ret void @@ -1627,8 +1707,9 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -1645,8 +1726,9 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -1663,8 +1745,9 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 @@ -1678,27 +1761,29 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_f32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_f32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_f32(float 4.0) ret void @@ -1711,8 +1796,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1730,8 +1816,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1749,8 +1836,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1765,28 +1853,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_v2f32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2f32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2f32(<2 x float> ) ret void @@ -1799,8 +1889,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1819,8 +1910,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1839,8 +1931,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1857,29 +1950,31 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3f32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3f32(<3 x float> ) ret void @@ -1892,8 +1987,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1914,8 +2010,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1936,8 +2033,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -1957,31 +2055,33 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5f32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v5f32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: v_mov_b32_e32 v3, -1.0 ; HSA-NEXT: v_mov_b32_e32 v4, 0.5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v5f32(<5 x float> ) ret void @@ -1994,8 +2094,9 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -2013,8 +2114,9 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -2032,8 +2134,9 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2048,28 +2151,30 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_f64_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_f64_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_f64(double 4.0) ret void @@ -2082,8 +2187,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -2103,8 +2209,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -2124,8 +2231,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2143,30 +2251,32 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2f64_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 ; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2f64(<2 x double> ) ret void @@ -2179,8 +2289,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -2202,8 +2313,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -2225,8 +2337,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2247,20 +2360,23 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f64@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3f64_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -2268,11 +2384,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v4, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3f64(<3 x double> ) ret void @@ -2281,15 +2396,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; VI-LABEL: test_call_external_void_func_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -2301,15 +2417,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; ; CI-LABEL: test_call_external_void_func_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -2323,15 +2440,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; ; GFX9-LABEL: test_call_external_void_func_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -2345,30 +2463,32 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2i16: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <2 x i16>, ptr addrspace(1) undef call void @external_void_func_v2i16(<2 x i16> %val) @@ -2378,15 +2498,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; VI-LABEL: test_call_external_void_func_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -2398,15 +2519,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; ; CI-LABEL: test_call_external_void_func_v3i16: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -2422,15 +2544,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; ; GFX9-LABEL: test_call_external_void_func_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -2444,30 +2567,32 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3i16: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <3 x i16>, ptr addrspace(1) undef call void @external_void_func_v3i16(<3 x i16> %val) @@ -2477,15 +2602,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; VI-LABEL: test_call_external_void_func_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -2497,15 +2623,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; ; CI-LABEL: test_call_external_void_func_v3f16: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -2515,22 +2642,23 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: test_call_external_void_func_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -2544,30 +2672,32 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3f16: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) undef call void @external_void_func_v3f16(<3 x half> %val) @@ -2581,8 +2711,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -2600,8 +2731,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -2620,8 +2752,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -2636,28 +2769,30 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_v3i16_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3i16_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 3 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3i16(<3 x i16> ) ret void @@ -2670,8 +2805,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x40003c00 @@ -2689,8 +2825,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2709,8 +2846,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 @@ -2726,45 +2864,48 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3f16_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3f16(<3 x half> ) ret void } define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { -; VI-LABEL: test_call_external_void_func_v4i16: -; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-LABEL: test_call_external_void_func_v4i16: +; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -2776,15 +2917,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; ; CI-LABEL: test_call_external_void_func_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -2801,15 +2943,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; ; GFX9-LABEL: test_call_external_void_func_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -2823,30 +2966,32 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v4i16: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <4 x i16>, ptr addrspace(1) undef call void @external_void_func_v4i16(<4 x i16> %val) @@ -2860,8 +3005,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -2879,8 +3025,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -2900,8 +3047,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -2917,28 +3065,30 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v4i16_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v4i16(<4 x i16> ) ret void @@ -2947,15 +3097,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; VI-LABEL: test_call_external_void_func_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -2967,15 +3118,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; ; CI-LABEL: test_call_external_void_func_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -2991,15 +3143,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; ; GFX9-LABEL: test_call_external_void_func_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -3013,30 +3166,32 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2f16: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) undef call void @external_void_func_v2f16(<2 x half> %val) @@ -3046,15 +3201,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; VI-LABEL: test_call_external_void_func_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -3066,15 +3222,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; ; CI-LABEL: test_call_external_void_func_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -3086,15 +3243,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; ; GFX9-LABEL: test_call_external_void_func_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -3108,30 +3266,32 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) undef call void @external_void_func_v2i32(<2 x i32> %val) @@ -3145,8 +3305,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -3164,8 +3325,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -3183,8 +3345,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -3199,28 +3362,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; GFX11-LABEL: test_call_external_void_func_v2i32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v2i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2i32(<2 x i32> ) ret void @@ -3233,8 +3398,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 @@ -3253,8 +3419,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 @@ -3273,8 +3440,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 @@ -3291,29 +3459,31 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3i32(<3 x i32> ) ret void @@ -3326,8 +3496,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 @@ -3347,8 +3518,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 @@ -3368,8 +3540,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 @@ -3387,30 +3560,32 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32_i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v3i32_i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: v_mov_b32_e32 v3, 6 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3i32_i32(<3 x i32> , i32 6) ret void @@ -3419,15 +3594,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; VI-LABEL: test_call_external_void_func_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -3439,15 +3615,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; ; CI-LABEL: test_call_external_void_func_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -3459,15 +3636,16 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; ; GFX9-LABEL: test_call_external_void_func_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -3481,30 +3659,32 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v4i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 ; HSA-NEXT: s_mov_b32 s6, -1 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <4 x i32>, ptr addrspace(1) undef call void @external_void_func_v4i32(<4 x i32> %val) @@ -3518,8 +3698,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -3539,8 +3720,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -3560,8 +3742,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -3579,30 +3762,32 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v4i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v4i32(<4 x i32> ) ret void @@ -3615,8 +3800,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -3637,8 +3823,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -3659,8 +3846,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -3680,31 +3868,33 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v5i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: v_mov_b32_e32 v4, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v5i32(<5 x i32> ) ret void @@ -3714,17 +3904,18 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; VI-LABEL: test_call_external_void_func_v8i32: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -3737,17 +3928,18 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CI-LABEL: test_call_external_void_func_v8i32: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -3760,17 +3952,18 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; GFX9-LABEL: test_call_external_void_func_v8i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -3782,39 +3975,40 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; ; GFX11-LABEL: test_call_external_void_func_v8i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v8i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i32>, ptr addrspace(1) %ptr @@ -3829,8 +4023,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -3854,8 +4049,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 @@ -3879,8 +4075,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -3904,20 +4101,23 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v8i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 @@ -3927,11 +4127,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v6, 7 ; HSA-NEXT: v_mov_b32_e32 v7, 8 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v8i32(<8 x i32> ) ret void @@ -3941,19 +4140,20 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; VI-LABEL: test_call_external_void_func_v16i32: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -3966,19 +4166,20 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CI-LABEL: test_call_external_void_func_v16i32: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -3991,19 +4192,20 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; GFX9-LABEL: test_call_external_void_func_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -4015,43 +4217,44 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; ; GFX11-LABEL: test_call_external_void_func_v16i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32@rel32@hi+12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 -; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v16i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i32>, ptr addrspace(1) %ptr @@ -4062,147 +4265,152 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-LABEL: test_call_external_void_func_v32i32: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 -; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_getpc_b64 s[8:9] +; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; VI-NEXT: s_endpgm ; ; CI-LABEL: test_call_external_void_func_v32i32: ; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 -; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; CI-NEXT: s_getpc_b64 s[8:9] +; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: test_call_external_void_func_v32i32: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 -; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v32i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32@rel32@hi+12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 -; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64 -; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80 -; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(7) ; GFX11-NEXT: scratch_store_b32 off, v31, s32 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v32i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 -; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 ; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[12:13] +; HSA-NEXT: s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12 ; HSA-NEXT: s_waitcnt vmcnt(7) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] ; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <32 x i32>, ptr addrspace(1) %ptr @@ -4214,24 +4422,25 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-LABEL: test_call_external_void_func_v32i32_i32: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], 0 -; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 -; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -4248,24 +4457,25 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-LABEL: test_call_external_void_func_v32i32_i32: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v32, off, s[0:3], 0 -; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 -; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -4282,24 +4492,25 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-LABEL: test_call_external_void_func_v32i32_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 -; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -4315,61 +4526,63 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; ; GFX11-LABEL: test_call_external_void_func_v32i32_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32_i32@rel32@hi+12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112 -; GFX11-NEXT: buffer_load_b32 v32, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 -; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64 -; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80 -; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 -; GFX11-NEXT: s_add_i32 s2, s32, 4 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112 +; GFX11-NEXT: buffer_load_b32 v32, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_add_i32 s4, s32, 4 ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b32 off, v31, s32 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b32 off, v32, s2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: scratch_store_b32 off, v32, s4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v32i32_i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12 ; HSA-NEXT: s_waitcnt vmcnt(8) ; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_waitcnt vmcnt(8) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef %val0 = load <32 x i32>, ptr addrspace(1) %ptr0 @@ -4385,9 +4598,10 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; VI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s42, -1 ; VI-NEXT: s_mov_b32 s43, 0xe80000 -; VI-NEXT: s_add_u32 s40, s40, s3 -; VI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x24 +; VI-NEXT: s_add_u32 s40, s40, s5 +; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 ; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[40:41] ; VI-NEXT: s_mov_b64 s[2:3], s[42:43] ; VI-NEXT: v_mov_b32_e32 v0, 42 @@ -4408,9 +4622,10 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; CI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s42, -1 ; CI-NEXT: s_mov_b32 s43, 0xe8f000 -; CI-NEXT: s_add_u32 s40, s40, s3 -; CI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9 +; CI-NEXT: s_add_u32 s40, s40, s5 +; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 ; CI-NEXT: s_addc_u32 s41, s41, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[40:41] ; CI-NEXT: s_mov_b64 s[2:3], s[42:43] ; CI-NEXT: v_mov_b32_e32 v0, 42 @@ -4431,9 +4646,10 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s42, -1 ; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s3 -; GFX9-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 s40, s40, s5 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 42 @@ -4450,36 +4666,38 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; ; GFX11-LABEL: test_call_external_i32_func_i32_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[36:37], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[36:37], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_mov_b32 s39, 0x31016000 ; GFX11-NEXT: s_mov_b32 s38, -1 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_i32_func_i32_imm: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_mov_b32 s39, 0x1100f000 ; HSA-NEXT: s_mov_b32 s38, -1 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_i32_func_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_i32_func_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: s_endpgm @@ -4492,17 +4710,18 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; VI-LABEL: test_call_external_void_func_struct_i8_i32: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 @@ -4515,17 +4734,18 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CI-LABEL: test_call_external_void_func_struct_i8_i32: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 @@ -4538,17 +4758,18 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -4560,39 +4781,40 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; ; GFX11-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: buffer_load_u8 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 offset:4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_struct_i8_i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; HSA-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 +; HSA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; HSA-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef %val = load { i8, i32 }, ptr addrspace(1) %ptr0 @@ -4607,7 +4829,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 @@ -4615,6 +4837,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 ; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_movk_i32 s32, 0x400 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -4634,7 +4857,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 @@ -4642,6 +4865,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 ; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_movk_i32 s32, 0x400 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -4661,7 +4885,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 @@ -4670,6 +4894,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -4687,23 +4912,24 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: s_mov_b32 s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, off ; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 ; GFX11-NEXT: scratch_load_b64 v[0:1], off, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_add_u32 s0, s0, s7 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4712,15 +4938,16 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x400 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = alloca { i8, i32 }, align 8, addrspace(5) %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0 @@ -4738,7 +4965,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 @@ -4747,6 +4974,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; VI-NEXT: s_movk_i32 s32, 0x800 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_getpc_b64 s[4:5] @@ -4775,7 +5003,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 @@ -4784,6 +5012,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; CI-NEXT: s_movk_i32 s32, 0x800 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_getpc_b64 s[4:5] @@ -4812,7 +5041,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_add_u32 s36, s36, s5 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 @@ -4822,6 +5051,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -4848,9 +5078,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: s_mov_b32 s32, 32 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, off ; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 @@ -4858,7 +5089,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 ; GFX11-NEXT: v_mov_b32_e32 v0, 8 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_u8 v0, off, off offset:8 ; GFX11-NEXT: scratch_load_b32 v1, off, off offset:12 @@ -4876,9 +5107,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4887,16 +5118,17 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x800 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; HSA-NEXT: v_mov_b32_e32 v0, 8 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:8 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 ; HSA-NEXT: s_mov_b32 s7, 0x1100f000 @@ -4928,19 +5160,20 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; VI-LABEL: test_call_external_void_func_v16i8: ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_add_u32 s36, s36, s1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 @@ -4969,19 +5202,20 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CI-LABEL: test_call_external_void_func_v16i8: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_add_u32 s36, s36, s1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 @@ -5010,19 +5244,20 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; GFX9-LABEL: test_call_external_void_func_v16i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 @@ -5050,15 +5285,16 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; ; GFX11-LABEL: test_call_external_void_func_v16i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i8@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i8@rel32@hi+12 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -5076,25 +5312,26 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; GFX11-NEXT: v_mov_b32_e32 v8, v2 ; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18 ; GFX11-NEXT: v_mov_b32_e32 v2, v17 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: test_call_external_void_func_v16i8: ; HSA: ; %bb.0: -; HSA-NEXT: s_add_i32 s4, s4, s7 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s5 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; HSA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HSA-NEXT: s_add_u32 s0, s0, s7 -; HSA-NEXT: s_mov_b32 s7, 0x1100f000 -; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i8@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i8@rel32@hi+12 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -5114,7 +5351,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: v_mov_b32_e32 v1, v16 ; HSA-NEXT: v_mov_b32_e32 v2, v17 ; HSA-NEXT: v_mov_b32_e32 v3, v18 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i8>, ptr addrspace(1) %ptr @@ -5129,18 +5366,19 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s54, -1 ; VI-NEXT: s_mov_b32 s55, 0xe80000 -; VI-NEXT: s_add_u32 s52, s52, s3 -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xa4 -; VI-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x24 +; VI-NEXT: s_add_u32 s52, s52, s5 +; VI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_addc_u32 s53, s53, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s19 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v0, s23 ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; VI-NEXT: buffer_store_dword v1, off, s[52:55], s32 offset:4 -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[52:53] ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; VI-NEXT: s_mov_b64 s[2:3], s[54:55] @@ -5160,25 +5398,25 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: v_mov_b32_e32 v13, s49 ; VI-NEXT: v_mov_b32_e32 v14, s50 ; VI-NEXT: v_mov_b32_e32 v15, s51 -; VI-NEXT: v_mov_b32_e32 v16, s4 -; VI-NEXT: v_mov_b32_e32 v17, s5 -; VI-NEXT: v_mov_b32_e32 v18, s6 -; VI-NEXT: v_mov_b32_e32 v19, s7 -; VI-NEXT: v_mov_b32_e32 v20, s8 -; VI-NEXT: v_mov_b32_e32 v21, s9 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v23, s11 -; VI-NEXT: v_mov_b32_e32 v24, s12 -; VI-NEXT: v_mov_b32_e32 v25, s13 -; VI-NEXT: v_mov_b32_e32 v26, s14 -; VI-NEXT: v_mov_b32_e32 v27, s15 -; VI-NEXT: v_mov_b32_e32 v28, s16 -; VI-NEXT: v_mov_b32_e32 v29, s17 -; VI-NEXT: v_mov_b32_e32 v30, s18 -; VI-NEXT: s_getpc_b64 s[20:21] -; VI-NEXT: s_add_u32 s20, s20, stack_passed_f64_arg@rel32@lo+4 -; VI-NEXT: s_addc_u32 s21, s21, stack_passed_f64_arg@rel32@hi+12 -; VI-NEXT: s_swappc_b64 s[30:31], s[20:21] +; VI-NEXT: v_mov_b32_e32 v16, s8 +; VI-NEXT: v_mov_b32_e32 v17, s9 +; VI-NEXT: v_mov_b32_e32 v18, s10 +; VI-NEXT: v_mov_b32_e32 v19, s11 +; VI-NEXT: v_mov_b32_e32 v20, s12 +; VI-NEXT: v_mov_b32_e32 v21, s13 +; VI-NEXT: v_mov_b32_e32 v22, s14 +; VI-NEXT: v_mov_b32_e32 v23, s15 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: v_mov_b32_e32 v25, s17 +; VI-NEXT: v_mov_b32_e32 v26, s18 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; ; CI-LABEL: stack_passed_arg_alignment_v32i32_f64: @@ -5187,18 +5425,19 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s54, -1 ; CI-NEXT: s_mov_b32 s55, 0xe8f000 -; CI-NEXT: s_add_u32 s52, s52, s3 -; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x29 -; CI-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x9 +; CI-NEXT: s_add_u32 s52, s52, s5 +; CI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 +; CI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x29 +; CI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x9 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_addc_u32 s53, s53, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s19 -; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v0, s23 ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; CI-NEXT: buffer_store_dword v1, off, s[52:55], s32 offset:4 -; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; CI-NEXT: v_mov_b32_e32 v0, s5 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[52:53] ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; CI-NEXT: s_mov_b64 s[2:3], s[54:55] @@ -5218,25 +5457,25 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: v_mov_b32_e32 v13, s49 ; CI-NEXT: v_mov_b32_e32 v14, s50 ; CI-NEXT: v_mov_b32_e32 v15, s51 -; CI-NEXT: v_mov_b32_e32 v16, s4 -; CI-NEXT: v_mov_b32_e32 v17, s5 -; CI-NEXT: v_mov_b32_e32 v18, s6 -; CI-NEXT: v_mov_b32_e32 v19, s7 -; CI-NEXT: v_mov_b32_e32 v20, s8 -; CI-NEXT: v_mov_b32_e32 v21, s9 -; CI-NEXT: v_mov_b32_e32 v22, s10 -; CI-NEXT: v_mov_b32_e32 v23, s11 -; CI-NEXT: v_mov_b32_e32 v24, s12 -; CI-NEXT: v_mov_b32_e32 v25, s13 -; CI-NEXT: v_mov_b32_e32 v26, s14 -; CI-NEXT: v_mov_b32_e32 v27, s15 -; CI-NEXT: v_mov_b32_e32 v28, s16 -; CI-NEXT: v_mov_b32_e32 v29, s17 -; CI-NEXT: v_mov_b32_e32 v30, s18 -; CI-NEXT: s_getpc_b64 s[20:21] -; CI-NEXT: s_add_u32 s20, s20, stack_passed_f64_arg@rel32@lo+4 -; CI-NEXT: s_addc_u32 s21, s21, stack_passed_f64_arg@rel32@hi+12 -; CI-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CI-NEXT: v_mov_b32_e32 v16, s8 +; CI-NEXT: v_mov_b32_e32 v17, s9 +; CI-NEXT: v_mov_b32_e32 v18, s10 +; CI-NEXT: v_mov_b32_e32 v19, s11 +; CI-NEXT: v_mov_b32_e32 v20, s12 +; CI-NEXT: v_mov_b32_e32 v21, s13 +; CI-NEXT: v_mov_b32_e32 v22, s14 +; CI-NEXT: v_mov_b32_e32 v23, s15 +; CI-NEXT: v_mov_b32_e32 v24, s16 +; CI-NEXT: v_mov_b32_e32 v25, s17 +; CI-NEXT: v_mov_b32_e32 v26, s18 +; CI-NEXT: v_mov_b32_e32 v27, s19 +; CI-NEXT: v_mov_b32_e32 v28, s20 +; CI-NEXT: v_mov_b32_e32 v29, s21 +; CI-NEXT: v_mov_b32_e32 v30, s22 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: @@ -5245,18 +5484,19 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s54, -1 ; GFX9-NEXT: s_mov_b32 s55, 0xe00000 -; GFX9-NEXT: s_add_u32 s52, s52, s3 -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xa4 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 s52, s52, s5 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_addc_u32 s53, s53, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s19 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s23 ; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 -; GFX9-NEXT: buffer_store_dword v1, off, s[52:55], s32 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] @@ -5276,43 +5516,43 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX9-NEXT: v_mov_b32_e32 v13, s49 ; GFX9-NEXT: v_mov_b32_e32 v14, s50 ; GFX9-NEXT: v_mov_b32_e32 v15, s51 -; GFX9-NEXT: v_mov_b32_e32 v16, s4 -; GFX9-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-NEXT: v_mov_b32_e32 v18, s6 -; GFX9-NEXT: v_mov_b32_e32 v19, s7 -; GFX9-NEXT: v_mov_b32_e32 v20, s8 -; GFX9-NEXT: v_mov_b32_e32 v21, s9 -; GFX9-NEXT: v_mov_b32_e32 v22, s10 -; GFX9-NEXT: v_mov_b32_e32 v23, s11 -; GFX9-NEXT: v_mov_b32_e32 v24, s12 -; GFX9-NEXT: v_mov_b32_e32 v25, s13 -; GFX9-NEXT: v_mov_b32_e32 v26, s14 -; GFX9-NEXT: v_mov_b32_e32 v27, s15 -; GFX9-NEXT: v_mov_b32_e32 v28, s16 -; GFX9-NEXT: v_mov_b32_e32 v29, s17 -; GFX9-NEXT: v_mov_b32_e32 v30, s18 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, stack_passed_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, stack_passed_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0xa4 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 -; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[20:21], s[2:3], 0xa4 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s20, s32, 8 +; GFX11-NEXT: s_add_i32 s22, s32, 8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20 ; GFX11-NEXT: v_mov_b32_e32 v2, s19 -; GFX11-NEXT: s_add_i32 s2, s32, 4 +; GFX11-NEXT: s_add_i32 s19, s32, 4 ; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43 -; GFX11-NEXT: scratch_store_b32 off, v0, s20 -; GFX11-NEXT: scratch_store_b32 off, v1, s2 +; GFX11-NEXT: scratch_store_b32 off, v0, s22 +; GFX11-NEXT: scratch_store_b32 off, v1, s19 ; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39 ; GFX11-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38 @@ -5329,31 +5569,33 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX11-NEXT: v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14 ; GFX11-NEXT: v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16 ; GFX11-NEXT: v_mov_b32_e32 v30, s18 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, stack_passed_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, stack_passed_f64_arg@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64: ; HSA: ; %bb.0: ; %entry -; HSA-NEXT: s_add_i32 s6, s6, s9 -; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; HSA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x80 -; HSA-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; HSA-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x80 +; HSA-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: v_mov_b32_e32 v0, s23 -; HSA-NEXT: v_mov_b32_e32 v1, s6 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; HSA-NEXT: v_mov_b32_e32 v0, s7 +; HSA-NEXT: v_mov_b32_e32 v0, s24 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, s25 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, s36 ; HSA-NEXT: v_mov_b32_e32 v1, s37 ; HSA-NEXT: v_mov_b32_e32 v2, s38 @@ -5385,10 +5627,10 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-NEXT: v_mov_b32_e32 v28, s20 ; HSA-NEXT: v_mov_b32_e32 v29, s21 ; HSA-NEXT: v_mov_b32_e32 v30, s22 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: s_getpc_b64 s[24:25] +; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25] ; HSA-NEXT: s_endpgm entry: call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index 06dec7e792389f..3626b2b316fba7 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -10,9 +10,9 @@ declare hidden void @callee() #0 define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_size !0 { ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 @@ -30,9 +30,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_size !1 { ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -49,9 +49,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_size !2 { ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -68,9 +68,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_size !3 { ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -87,9 +87,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_size !4 { ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -107,9 +107,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_size !5 { ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 60f2dc1ce414d0..10f0efea59b607 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -5,19 +5,20 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; GCN-LABEL: call_memory_arg_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_load_dword s6, s[6:7], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, ptr addrspace(3) %ptr call void @func(i32 %vgpr) @@ -28,20 +29,21 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_memory_no_dep: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v0, v0, s[4:5] +; GCN-NEXT: global_store_dword v0, v0, s[6:7] +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm store i32 0, ptr addrspace(1) %ptr call void @func(i32 0) @@ -52,18 +54,19 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 ; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: global_store_dword v40, v40, s[34:35] ; GCN-NEXT: s_endpgm call void @func(i32 0) @@ -74,18 +77,19 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) # define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 ; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: global_store_dword v40, v0, s[34:35] ; GCN-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) @@ -97,18 +101,19 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) % define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 { ; GCN-LABEL: call_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm call void @got.func(i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 8ef2d89e76d4e1..8fde0dd2d28ed4 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -30,7 +30,7 @@ define hidden void @use_queue_ptr() #1 { } ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr: -; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: s_swappc_b64 s[30:31], s[10:11] ; GCN: .amdhsa_user_sgpr_queue_ptr 0 define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 { call void @use_queue_ptr() @@ -489,7 +489,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 { ; We have to pass the kernarg segment, but there are no kernel ; arguments so null is passed. ; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input_no_kernargs: -; GCN: s_mov_b64 s[10:11], s[6:7] +; GCN: s_mov_b64 s[10:11], s[8:9] ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index b334047d325555..0009a84765639c 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -8,7 +8,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; SI-LABEL: kernel: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -18,7 +18,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; VI-LABEL: kernel: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -28,7 +28,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; GFX11-LABEL: kernel: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -117,24 +117,25 @@ define amdgpu_kernel void @call_coldcc() #0 { ; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 +; SI-NEXT: s_add_u32 s20, s20, s11 ; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s13, s9 +; SI-NEXT: s_mov_b32 s12, s8 +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_add_u32 s8, s4, 36 ; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; SI-NEXT: s_addc_u32 s9, s5, 0 +; SI-NEXT: s_getpc_b64 s[4:5] +; SI-NEXT: s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_mov_b64 s[0:1], s[20:21] ; SI-NEXT: s_mov_b64 s[2:3], s[22:23] ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -150,24 +151,25 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s11 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s12, s8 +; VI-NEXT: s_add_u32 s8, s4, 36 +; VI-NEXT: s_mov_b32 s13, s9 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], s[4:5] +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] ; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_mov_b64 s[6:7], s[2:3] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] ; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 @@ -178,21 +180,22 @@ define amdgpu_kernel void @call_coldcc() #0 { ; ; GFX11-LABEL: call_coldcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: s_add_u32 s8, s4, 36 +; GFX11-NEXT: s_addc_u32 s9, s5, 0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: s_mov_b32 s13, s14 ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) @@ -208,24 +211,25 @@ define amdgpu_kernel void @call_fastcc() #0 { ; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 +; SI-NEXT: s_add_u32 s20, s20, s11 ; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s13, s9 +; SI-NEXT: s_mov_b32 s12, s8 +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_add_u32 s8, s4, 36 ; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; SI-NEXT: s_addc_u32 s9, s5, 0 +; SI-NEXT: s_getpc_b64 s[4:5] +; SI-NEXT: s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_mov_b64 s[0:1], s[20:21] ; SI-NEXT: s_mov_b64 s[2:3], s[22:23] ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -241,24 +245,25 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s11 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s12, s8 +; VI-NEXT: s_add_u32 s8, s4, 36 +; VI-NEXT: s_mov_b32 s13, s9 +; VI-NEXT: s_addc_u32 s9, s5, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], s[4:5] +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] ; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_mov_b64 s[6:7], s[2:3] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] ; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 @@ -269,21 +274,22 @@ define amdgpu_kernel void @call_fastcc() #0 { ; ; GFX11-LABEL: call_fastcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: s_add_u32 s8, s4, 36 +; GFX11-NEXT: s_addc_u32 s9, s5, 0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: s_mov_b32 s13, s14 ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) @@ -986,7 +992,7 @@ define amdgpu_ps i16 @ret_ps_mesa_i16() { define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; SI-LABEL: amd_kernel_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s0, s0, s0 @@ -997,7 +1003,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; VI-LABEL: amd_kernel_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1006,7 +1012,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; GFX11-LABEL: amd_kernel_i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1022,7 +1028,7 @@ entry: define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; SI-LABEL: amd_kernel_v2i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1040,7 +1046,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v2i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1056,7 +1062,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v2i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bfe_u32 s1, s0, 0x80008 @@ -1078,7 +1084,7 @@ entry: define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; SI-LABEL: amd_kernel_v4i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1106,7 +1112,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v4i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,7 +1138,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v4i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 @@ -1164,7 +1170,7 @@ entry: define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; SI-LABEL: amd_kernel_v3i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1188,7 +1194,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v3i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1210,7 +1216,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v3i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 @@ -1237,7 +1243,7 @@ entry: define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; SI-LABEL: amd_kernel_v5i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1270,7 +1276,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 4 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1302,7 +1308,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 @@ -1338,7 +1344,7 @@ entry: define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; SI-LABEL: amd_kernel_v8i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1383,7 +1389,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v8i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1426,7 +1432,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v8i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24 @@ -1474,7 +1480,7 @@ entry: define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-LABEL: amd_kernel_v16i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1553,7 +1559,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v16i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,7 +1636,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v16i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s6, s1, 16 ; GFX11-NEXT: s_lshr_b32 s7, s1, 24 @@ -1710,7 +1716,7 @@ entry: define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-LABEL: amd_kernel_v32i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s9, 0 ; SI-NEXT: s_mov_b32 s8, 16 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1860,7 +1866,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v32i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 16 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2008,7 +2014,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v32i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s16, s0, 16 ; GFX11-NEXT: s_lshr_b32 s17, s0, 24 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 8352376a9c1371..fc896150591528 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -18,100 +18,100 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: sadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s3, 0xf000 -; CISI-NEXT: s_mov_b32 s2, -1 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CISI-NEXT: s_mov_b32 s7, 0xf000 +; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s0, s4 -; CISI-NEXT: s_add_u32 s4, s6, s8 -; CISI-NEXT: s_mov_b32 s1, s5 -; CISI-NEXT: s_addc_u32 s5, s7, s9 -; CISI-NEXT: v_mov_b32_e32 v0, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s5 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CISI-NEXT: s_mov_b32 s4, s0 +; CISI-NEXT: s_add_u32 s0, s2, s8 +; CISI-NEXT: s_mov_b32 s5, s1 +; CISI-NEXT: s_addc_u32 s1, s3, s9 +; CISI-NEXT: v_mov_b32_e32 v0, s0 +; CISI-NEXT: v_mov_b32_e32 v1, s1 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: sadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_u32 s0, s6, s0 -; VI-NEXT: s_addc_u32 s1, s7, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s2, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s3, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: sadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_add_u32 s2, s2, s6 +; GFX9-NEXT: s_addc_u32 s3, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: sadd64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s0, s6, s0 -; GFX1010-NEXT: s_addc_u32 s1, s7, s1 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: s_add_u32 s2, s2, s6 +; GFX1010-NEXT: s_addc_u32 s3, s3, s7 +; GFX1010-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: sadd64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_add_u32 s0, s6, s0 -; GFX1030W32-NEXT: s_addc_u32 s1, s7, s1 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1030W32-NEXT: s_add_u32 s2, s2, s4 +; GFX1030W32-NEXT: s_addc_u32 s3, s3, s5 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: sadd64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_add_u32 s0, s6, s0 -; GFX1030W64-NEXT: s_addc_u32 s1, s7, s1 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1030W64-NEXT: s_add_u32 s2, s2, s4 +; GFX1030W64-NEXT: s_addc_u32 s3, s3, s5 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: sadd64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s0, s6, s0 -; GFX11-NEXT: s_addc_u32 s1, s7, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_add_u32 s2, s2, s4 +; GFX11-NEXT: s_addc_u32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %add = add i64 %a, %b @@ -127,7 +127,7 @@ entry: define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: sadd64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -142,7 +142,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: sadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 0x56789876 @@ -155,31 +155,31 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: sadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, 0x56789876 -; GFX9-NEXT: s_addc_u32 s1, s7, 0x1234 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876 +; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: sadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s0, s6, 0x56789876 -; GFX1010-NEXT: s_addc_u32 s1, s7, 0x1234 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876 +; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234 +; GFX1010-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: sadd64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -191,7 +191,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: sadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -203,7 +203,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: sadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876 ; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234 @@ -225,7 +225,7 @@ entry: define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -239,7 +239,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 @@ -251,28 +251,28 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: vadd64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v0, s0, s6, v0 -; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s7, 0, s0 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: vadd64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0 @@ -282,7 +282,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vadd64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0 @@ -292,7 +292,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vadd64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -317,7 +317,7 @@ entry: define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vadd64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CISI-NEXT: v_add_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -329,7 +329,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -341,7 +341,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -352,8 +352,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1010-NEXT: s_mov_b32 null, 0 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2 @@ -363,7 +362,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vadd64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 @@ -373,7 +372,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3] @@ -383,7 +382,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -407,20 +406,20 @@ entry: define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: suaddo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_add_i32 s4, s4, s5 +; CISI-NEXT: s_add_i32 s4, s6, s7 ; CISI-NEXT: v_mov_b32_e32 v0, s4 ; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: suaddo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -431,32 +430,32 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: suaddo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: suaddo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_i32 s0, s0, s1 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 -; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1010-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: suaddo32: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 @@ -467,8 +466,8 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: suaddo32: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 @@ -479,8 +478,8 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: suaddo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -507,35 +506,35 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: uaddo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CISI-NEXT: s_mov_b32 s7, 0xf000 +; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s13 -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: s_mov_b32 s4, s0 +; CISI-NEXT: v_mov_b32_e32 v0, s9 +; CISI-NEXT: s_mov_b32 s5, s1 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; CISI-NEXT: s_mov_b32 s0, s2 +; CISI-NEXT: s_mov_b32 s1, s3 +; CISI-NEXT: s_mov_b32 s2, s6 +; CISI-NEXT: s_mov_b32 s3, s7 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: uaddo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: flat_store_byte v[2:3], v5 @@ -543,38 +542,38 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: uaddo32_vcc_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: uaddo32_vcc_user: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v1, s0, s0, s1 -; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] -; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] +; GFX1010-NEXT: v_add_co_u32 v1, s4, s6, s7 +; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX1010-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1010-NEXT: global_store_byte v0, v2, s[2:3] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: uaddo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5 +; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s6, s7 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3] @@ -583,11 +582,11 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: uaddo32_vcc_user: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5 +; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s6, s7 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3] @@ -596,11 +595,11 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: uaddo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5 +; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX11-NEXT: s_clause 0x1 @@ -623,7 +622,7 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: suaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -647,7 +646,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: suaddo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -667,39 +666,39 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: suaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s8, s10 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s9, s11 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX9-NEXT: global_store_byte v4, v0, s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: suaddo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s0, s8, s10 -; GFX1010-NEXT: s_addc_u32 s1, s9, s11 +; GFX1010-NEXT: s_add_u32 s0, s12, s14 +; GFX1010-NEXT: s_addc_u32 s1, s13, s15 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] +; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX1010-NEXT: global_store_byte v2, v3, s[10:11] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: suaddo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s6, s4, s6 @@ -714,7 +713,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: suaddo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s6, s4, s6 @@ -729,7 +728,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: suaddo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s6, s4, s6 ; GFX11-NEXT: s_addc_u32 s7, s5, s7 @@ -758,22 +757,22 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vuaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CISI-NEXT: s_mov_b32 s7, 0xf000 +; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s13 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s4, s0 +; CISI-NEXT: v_mov_b32_e32 v1, s9 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; CISI-NEXT: s_mov_b32 s5, s1 +; CISI-NEXT: s_mov_b32 s0, s2 +; CISI-NEXT: s_mov_b32 s1, s3 +; CISI-NEXT: s_mov_b32 s2, s6 +; CISI-NEXT: s_mov_b32 s3, s7 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -781,17 +780,17 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: vuaddo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[5:6] -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] ; VI-NEXT: flat_store_byte v[3:4], v0 @@ -799,44 +798,44 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vuaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v2, v0, s[6:7] +; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: vuaddo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v0, s2, s0, v0 -; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s1, 0, s2 -; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX1010-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 +; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-NEXT: global_store_byte v2, v3, s[2:3] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: vuaddo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0 -; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6 -; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1030W32-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 +; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1] ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3] @@ -845,13 +844,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vuaddo64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0 -; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s[6:7] -; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s6, v0 +; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5] +; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3] @@ -860,16 +859,16 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vuaddo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6 +; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -893,100 +892,100 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: ssub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s3, 0xf000 -; CISI-NEXT: s_mov_b32 s2, -1 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CISI-NEXT: s_mov_b32 s7, 0xf000 +; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s0, s4 -; CISI-NEXT: s_sub_u32 s4, s6, s8 -; CISI-NEXT: s_mov_b32 s1, s5 -; CISI-NEXT: s_subb_u32 s5, s7, s9 -; CISI-NEXT: v_mov_b32_e32 v0, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s5 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CISI-NEXT: s_mov_b32 s4, s0 +; CISI-NEXT: s_sub_u32 s0, s2, s8 +; CISI-NEXT: s_mov_b32 s5, s1 +; CISI-NEXT: s_subb_u32 s1, s3, s9 +; CISI-NEXT: v_mov_b32_e32 v0, s0 +; CISI-NEXT: v_mov_b32_e32 v1, s1 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: ssub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_sub_u32 s0, s6, s0 -; VI-NEXT: s_subb_u32 s1, s7, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s2, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s3, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: ssub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s1, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_sub_u32 s2, s2, s6 +; GFX9-NEXT: s_subb_u32 s3, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: ssub64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s0, s6, s0 -; GFX1010-NEXT: s_subb_u32 s1, s7, s1 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: s_sub_u32 s2, s2, s6 +; GFX1010-NEXT: s_subb_u32 s3, s3, s7 +; GFX1010-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: ssub64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0 -; GFX1030W32-NEXT: s_subb_u32 s1, s7, s1 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1030W32-NEXT: s_sub_u32 s2, s2, s4 +; GFX1030W32-NEXT: s_subb_u32 s3, s3, s5 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: ssub64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0 -; GFX1030W64-NEXT: s_subb_u32 s1, s7, s1 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1030W64-NEXT: s_sub_u32 s2, s2, s4 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s5 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: ssub64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_u32 s0, s6, s0 -; GFX11-NEXT: s_subb_u32 s1, s7, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sub_u32 s2, s2, s4 +; GFX11-NEXT: s_subb_u32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %sub = sub i64 %a, %b @@ -1002,7 +1001,7 @@ entry: define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: ssub64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1017,7 +1016,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: ssub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, 0x56789876, s2 @@ -1030,31 +1029,31 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: ssub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, 0x56789876, s6 -; GFX9-NEXT: s_subb_u32 s1, 0x1234, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2 +; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: ssub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s0, 0x56789876, s6 -; GFX1010-NEXT: s_subb_u32 s1, 0x1234, s7 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2 +; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3 +; GFX1010-NEXT: v_mov_b32_e32 v0, s2 +; GFX1010-NEXT: v_mov_b32_e32 v1, s3 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: ssub64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1066,7 +1065,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: ssub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1078,7 +1077,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: ssub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2 ; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3 @@ -1100,7 +1099,7 @@ entry: define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vsub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1114,7 +1113,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vsub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0 @@ -1126,28 +1125,28 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vsub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: vsub64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v0, s0, s6, v0 -; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0 +; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: vsub64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0 @@ -1157,7 +1156,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vsub64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0 @@ -1167,7 +1166,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vsub64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1192,7 +1191,7 @@ entry: define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vsub64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -1204,7 +1203,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vsub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1216,7 +1215,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vsub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1227,8 +1226,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vsub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1010-NEXT: s_mov_b32 null, 0 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2 @@ -1238,7 +1236,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vsub64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 @@ -1248,7 +1246,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vsub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3] @@ -1258,7 +1256,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vsub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1283,20 +1281,20 @@ entry: define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: susubo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_sub_i32 s4, s4, s5 +; CISI-NEXT: s_sub_i32 s4, s6, s7 ; CISI-NEXT: v_mov_b32_e32 v0, s4 ; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: susubo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sub_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1307,32 +1305,32 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: susubo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: susubo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_i32 s0, s0, s1 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 -; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1010-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: susubo32: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_i32 s0, s0, s1 @@ -1343,8 +1341,8 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: susubo32: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_i32 s0, s0, s1 @@ -1355,8 +1353,8 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: susubo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1383,35 +1381,35 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: usubo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CISI-NEXT: s_mov_b32 s7, 0xf000 +; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s13 -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: s_mov_b32 s4, s0 +; CISI-NEXT: v_mov_b32_e32 v0, s9 +; CISI-NEXT: s_mov_b32 s5, s1 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; CISI-NEXT: s_mov_b32 s0, s2 +; CISI-NEXT: s_mov_b32 s1, s3 +; CISI-NEXT: s_mov_b32 s2, s6 +; CISI-NEXT: s_mov_b32 s3, s7 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: usubo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: flat_store_byte v[2:3], v5 @@ -1419,38 +1417,38 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: usubo32_vcc_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: usubo32_vcc_user: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v1, s0, s0, s1 -; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] -; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] +; GFX1010-NEXT: v_sub_co_u32 v1, s4, s6, s7 +; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX1010-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1010-NEXT: global_store_byte v0, v2, s[2:3] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: usubo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5 +; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s6, s7 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3] @@ -1459,11 +1457,11 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: usubo32_vcc_user: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5 +; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s6, s7 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3] @@ -1472,11 +1470,11 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: usubo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5 +; GFX11-NEXT: v_sub_co_u32 v1, s4, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX11-NEXT: s_clause 0x1 @@ -1499,7 +1497,7 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: susubo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1523,7 +1521,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: susubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -1543,39 +1541,39 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: susubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s8, s10 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_subb_u32 s1, s9, s11 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX9-NEXT: global_store_byte v4, v0, s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: susubo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s0, s8, s10 -; GFX1010-NEXT: s_subb_u32 s1, s9, s11 +; GFX1010-NEXT: s_sub_u32 s0, s12, s14 +; GFX1010-NEXT: s_subb_u32 s1, s13, s15 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9] +; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX1010-NEXT: global_store_byte v2, v3, s[10:11] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: susubo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6 @@ -1590,7 +1588,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: susubo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6 @@ -1605,7 +1603,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: susubo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s6, s4, s6 ; GFX11-NEXT: s_subb_u32 s7, s5, s7 @@ -1634,22 +1632,22 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vusubo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CISI-NEXT: s_mov_b32 s7, 0xf000 +; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s13 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s4, s0 +; CISI-NEXT: v_mov_b32_e32 v1, s9 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; CISI-NEXT: s_mov_b32 s5, s1 +; CISI-NEXT: s_mov_b32 s0, s2 +; CISI-NEXT: s_mov_b32 s1, s3 +; CISI-NEXT: s_mov_b32 s2, s6 +; CISI-NEXT: s_mov_b32 s3, s7 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -1657,17 +1655,17 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: vusubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_sub_u32_e32 v5, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0 ; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[5:6] -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] ; VI-NEXT: flat_store_byte v[3:4], v0 @@ -1675,44 +1673,44 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vusubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v2, v0, s[6:7] +; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX1010-LABEL: vusubo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v0, s2, s0, v0 -; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s1, 0, s2 -; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0 +; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4 +; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1010-NEXT: global_store_byte v2, v3, s[2:3] ; GFX1010-NEXT: s_endpgm ; ; GFX1030W32-LABEL: vusubo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0 -; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6 -; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0 +; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4 +; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1] ; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3] @@ -1721,13 +1719,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vusubo64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0 -; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s[6:7] -; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0 +; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5] +; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3] @@ -1736,16 +1734,16 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vusubo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0 -; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6 +; GFX11-NEXT: v_sub_co_u32 v0, s4, s6, v0 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1772,10 +1770,10 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-LABEL: sudiv64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CISI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] +; CISI-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] ; CISI-NEXT: s_mov_b32 s0, 0 ; CISI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[0:1], 0 ; CISI-NEXT: s_and_b64 vcc, exec, s[0:1] @@ -1837,15 +1835,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CISI-NEXT: v_mul_lo_u32 v2, s6, v1 -; CISI-NEXT: v_mul_hi_u32 v3, s6, v0 -; CISI-NEXT: v_mul_hi_u32 v4, s6, v1 -; CISI-NEXT: v_mul_hi_u32 v5, s7, v1 -; CISI-NEXT: v_mul_lo_u32 v1, s7, v1 +; CISI-NEXT: v_mul_lo_u32 v2, s10, v1 +; CISI-NEXT: v_mul_hi_u32 v3, s10, v0 +; CISI-NEXT: v_mul_hi_u32 v4, s10, v1 +; CISI-NEXT: v_mul_hi_u32 v5, s11, v1 +; CISI-NEXT: v_mul_lo_u32 v1, s11, v1 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; CISI-NEXT: v_mul_lo_u32 v4, s7, v0 -; CISI-NEXT: v_mul_hi_u32 v0, s7, v0 +; CISI-NEXT: v_mul_lo_u32 v4, s11, v0 +; CISI-NEXT: v_mul_hi_u32 v0, s11, v0 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CISI-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; CISI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1858,8 +1856,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CISI-NEXT: v_mul_lo_u32 v3, s2, v0 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CISI-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 -; CISI-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 +; CISI-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 +; CISI-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 ; CISI-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc ; CISI-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; CISI-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] @@ -1876,7 +1874,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; CISI-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; CISI-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; CISI-NEXT: v_mov_b32_e32 v6, s7 +; CISI-NEXT: v_mov_b32_e32 v6, s11 ; CISI-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc ; CISI-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; CISI-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc @@ -1897,10 +1895,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_mul_lo_u32 v1, s0, v0 ; CISI-NEXT: v_mul_hi_u32 v1, v0, v1 ; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CISI-NEXT: v_mul_hi_u32 v0, s6, v0 +; CISI-NEXT: v_mul_hi_u32 v0, s10, v0 ; CISI-NEXT: v_readfirstlane_b32 s0, v0 ; CISI-NEXT: s_mul_i32 s0, s0, s2 -; CISI-NEXT: s_sub_i32 s0, s6, s0 +; CISI-NEXT: s_sub_i32 s0, s10, s0 ; CISI-NEXT: s_sub_i32 s1, s0, s2 ; CISI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 ; CISI-NEXT: s_cmp_ge_u32 s0, s2 @@ -1913,9 +1911,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CISI-NEXT: v_mov_b32_e32 v1, 0 ; CISI-NEXT: .LBB16_3: -; CISI-NEXT: s_mov_b32 s7, 0xf000 -; CISI-NEXT: s_mov_b32 s6, -1 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CISI-NEXT: s_mov_b32 s11, 0xf000 +; CISI-NEXT: s_mov_b32 s10, -1 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; CISI-NEXT: s_endpgm ; CISI-NEXT: .LBB16_4: ; CISI-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1923,18 +1921,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; VI-LABEL: sudiv64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] +; VI-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] ; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_cbranch_scc0 .LBB16_4 ; VI-NEXT: ; %bb.1: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 ; VI-NEXT: v_cvt_f32_u32_e32 v1, s3 -; VI-NEXT: s_sub_u32 s8, 0, s2 -; VI-NEXT: s_subb_u32 s9, 0, s3 +; VI-NEXT: s_sub_u32 s4, 0, s2 +; VI-NEXT: s_subb_u32 s5, 0, s3 ; VI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1943,9 +1941,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v1 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v0 -; VI-NEXT: v_mul_lo_u32 v2, s8, v4 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 -; VI-NEXT: v_mul_lo_u32 v3, s9, v5 +; VI-NEXT: v_mul_lo_u32 v2, s4, v4 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v5, 0 +; VI-NEXT: v_mul_lo_u32 v3, s5, v5 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 ; VI-NEXT: v_mul_hi_u32 v6, v5, v0 @@ -1961,9 +1959,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; VI-NEXT: v_mul_lo_u32 v4, s8, v7 -; VI-NEXT: v_mul_lo_u32 v5, s9, v6 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v6, 0 +; VI-NEXT: v_mul_lo_u32 v4, s4, v7 +; VI-NEXT: v_mul_lo_u32 v5, s5, v6 ; VI-NEXT: v_mul_hi_u32 v8, v6, v0 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0 ; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 @@ -1979,33 +1977,33 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 -; VI-NEXT: v_mul_hi_u32 v4, s6, v2 -; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: v_readfirstlane_b32 s9, v0 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v3, 0 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v2, 0 -; VI-NEXT: v_readfirstlane_b32 s10, v4 -; VI-NEXT: s_add_u32 s0, s10, s9 -; VI-NEXT: s_addc_u32 s1, 0, s8 -; VI-NEXT: v_readfirstlane_b32 s10, v2 -; VI-NEXT: v_readfirstlane_b32 s9, v3 -; VI-NEXT: s_add_u32 s0, s0, s10 -; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: s_addc_u32 s0, s1, s9 -; VI-NEXT: s_addc_u32 s10, s8, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 +; VI-NEXT: v_mul_hi_u32 v4, s10, v2 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s11, v3, 0 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s11, v2, 0 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_add_u32 s0, s6, s5 +; VI-NEXT: s_addc_u32 s1, 0, s4 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s5, v3 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_addc_u32 s0, s1, s5 +; VI-NEXT: s_addc_u32 s6, s4, 0 ; VI-NEXT: v_readfirstlane_b32 s1, v0 -; VI-NEXT: s_add_u32 s11, s0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: s_add_u32 s7, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0 -; VI-NEXT: s_addc_u32 s10, 0, s10 -; VI-NEXT: s_mul_i32 s0, s2, s10 +; VI-NEXT: s_addc_u32 s6, 0, s6 +; VI-NEXT: s_mul_i32 s0, s2, s6 ; VI-NEXT: v_readfirstlane_b32 s1, v1 ; VI-NEXT: s_add_i32 s0, s1, s0 -; VI-NEXT: s_mul_i32 s1, s3, s11 +; VI-NEXT: s_mul_i32 s1, s3, s7 ; VI-NEXT: s_add_i32 s12, s0, s1 -; VI-NEXT: s_sub_i32 s0, s7, s12 -; VI-NEXT: v_sub_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: s_sub_i32 s0, s11, s12 +; VI-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 ; VI-NEXT: s_cmp_lg_u64 vcc, 0 ; VI-NEXT: s_subb_u32 s13, s0, s3 ; VI-NEXT: v_subrev_u32_e64 v1, s[0:1], s2, v0 @@ -2019,19 +2017,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_mov_b32_e32 v3, s14 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] -; VI-NEXT: s_add_u32 s0, s11, 1 -; VI-NEXT: s_addc_u32 s13, s10, 0 -; VI-NEXT: s_add_u32 s1, s11, 2 -; VI-NEXT: s_addc_u32 s11, s10, 0 +; VI-NEXT: s_add_u32 s0, s7, 1 +; VI-NEXT: s_addc_u32 s13, s6, 0 +; VI-NEXT: s_add_u32 s1, s7, 2 +; VI-NEXT: s_addc_u32 s7, s6, 0 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; VI-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_mov_b32_e32 v4, s11 +; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: s_cmp_lg_u64 vcc, 0 ; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; VI-NEXT: s_subb_u32 s0, s7, s12 +; VI-NEXT: s_subb_u32 s0, s11, s12 ; VI-NEXT: s_cmp_ge_u32 s0, s3 ; VI-NEXT: s_cselect_b32 s1, -1, 0 ; VI-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 @@ -2040,7 +2038,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -2054,10 +2052,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_mul_lo_u32 v1, s0, v0 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: v_mul_hi_u32 v0, s6, v0 +; VI-NEXT: v_mul_hi_u32 v0, s10, v0 ; VI-NEXT: v_readfirstlane_b32 s0, v0 ; VI-NEXT: s_mul_i32 s0, s0, s2 -; VI-NEXT: s_sub_i32 s0, s6, s0 +; VI-NEXT: s_sub_i32 s0, s10, s0 ; VI-NEXT: s_sub_i32 s1, s0, s2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 ; VI-NEXT: s_cmp_ge_u32 s0, s2 @@ -2070,8 +2068,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: .LBB16_3: -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB16_4: @@ -2080,18 +2078,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GFX9-LABEL: sudiv64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_sub_u32 s0, 0, s2 +; GFX9-NEXT: s_subb_u32 s1, 0, s3 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2100,96 +2098,96 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s10, v1 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s12, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 -; GFX9-NEXT: s_mul_i32 s13, s1, s11 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s12, s0, s6 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s7 +; GFX9-NEXT: s_mul_i32 s13, s1, s7 ; GFX9-NEXT: s_add_i32 s12, s14, s12 ; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s15, s0, s11 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 -; GFX9-NEXT: s_mul_i32 s14, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s11, s15 -; GFX9-NEXT: s_add_u32 s11, s11, s14 +; GFX9-NEXT: s_mul_i32 s15, s0, s7 +; GFX9-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX9-NEXT: s_mul_i32 s14, s7, s12 +; GFX9-NEXT: s_mul_hi_u32 s7, s7, s15 +; GFX9-NEXT: s_add_u32 s7, s7, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 -; GFX9-NEXT: s_add_u32 s11, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 -; GFX9-NEXT: s_addc_u32 s11, s13, s16 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 +; GFX9-NEXT: s_mul_i32 s15, s6, s15 +; GFX9-NEXT: s_add_u32 s7, s7, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s12 +; GFX9-NEXT: s_addc_u32 s7, s13, s16 ; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_mul_i32 s12, s6, s12 +; GFX9-NEXT: s_add_u32 s7, s7, s12 ; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s11, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s7, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s12 +; GFX9-NEXT: s_addc_u32 s6, s6, s12 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s11, s0, s10 +; GFX9-NEXT: s_mul_i32 s7, s0, s6 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s11, s13, s11 +; GFX9-NEXT: s_add_i32 s7, s13, s7 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s11, s11, s1 +; GFX9-NEXT: s_add_i32 s7, s7, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 -; GFX9-NEXT: s_mul_i32 s14, s10, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s13, s6, s0 +; GFX9-NEXT: s_mul_i32 s14, s6, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s7 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s7 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s11, s10, s11 -; GFX9-NEXT: s_add_u32 s0, s0, s11 +; GFX9-NEXT: s_mul_i32 s7, s6, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s0, s10, s1 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s10, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s12, s6, s11 -; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0 -; GFX9-NEXT: s_add_u32 s10, s12, s10 +; GFX9-NEXT: s_addc_u32 s0, s6, s1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s10, s0 +; GFX9-NEXT: s_mul_hi_u32 s12, s10, s7 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s0 +; GFX9-NEXT: s_add_u32 s6, s12, s6 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_hi_u32 s13, s7, s11 -; GFX9-NEXT: s_mul_i32 s11, s7, s11 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s7, s0 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s7 +; GFX9-NEXT: s_mul_i32 s7, s11, s7 +; GFX9-NEXT: s_add_u32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s12, s11, s0 ; GFX9-NEXT: s_addc_u32 s1, s1, s13 -; GFX9-NEXT: s_addc_u32 s10, s12, 0 -; GFX9-NEXT: s_mul_i32 s0, s7, s0 -; GFX9-NEXT: s_add_u32 s11, s1, s0 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s0, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s11 +; GFX9-NEXT: s_addc_u32 s6, s12, 0 +; GFX9-NEXT: s_mul_i32 s0, s11, s0 +; GFX9-NEXT: s_add_u32 s7, s1, s0 +; GFX9-NEXT: s_addc_u32 s6, 0, s6 +; GFX9-NEXT: s_mul_i32 s0, s2, s6 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s7 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s11 +; GFX9-NEXT: s_mul_i32 s1, s3, s7 ; GFX9-NEXT: s_add_i32 s12, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s11 +; GFX9-NEXT: s_mul_i32 s1, s2, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_sub_i32 s0, s7, s12 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: s_sub_i32 s0, s11, s12 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s13, s0, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s8, v0 +; GFX9-NEXT: s_subb_u32 s13, s0, s3 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s9 +; GFX9-NEXT: s_cmp_ge_u32 s13, s3 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s13, s9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 +; GFX9-NEXT: s_cmp_eq_u32 s13, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s11, 1 -; GFX9-NEXT: s_addc_u32 s13, s10, 0 -; GFX9-NEXT: s_add_u32 s1, s11, 2 -; GFX9-NEXT: s_addc_u32 s14, s10, 0 +; GFX9-NEXT: s_add_u32 s0, s7, 1 +; GFX9-NEXT: s_addc_u32 s13, s6, 0 +; GFX9-NEXT: s_add_u32 s1, s7, 2 +; GFX9-NEXT: s_addc_u32 s14, s6, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 @@ -2198,48 +2196,48 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_mov_b32_e32 v3, s14 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s7, s12 -; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_subb_u32 s0, s11, s12 +; GFX9-NEXT: s_cmp_ge_u32 s0, s3 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX9-NEXT: s_cmp_eq_u32 s0, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_sub_i32 s0, 0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s0, 0, s2 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s0, s2, s0 -; GFX9-NEXT: s_add_i32 s2, s2, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 -; GFX9-NEXT: s_mul_i32 s3, s0, s8 -; GFX9-NEXT: s_sub_i32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_sub_i32 s6, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s3, s0 +; GFX9-NEXT: s_add_i32 s3, s3, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s10, s3 +; GFX9-NEXT: s_mul_i32 s4, s0, s2 +; GFX9-NEXT: s_sub_i32 s4, s10, s4 +; GFX9-NEXT: s_add_i32 s3, s0, 1 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s0, s3, s0 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s3, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s0, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2248,18 +2246,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-LABEL: sudiv64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_or_b64 s[2:3], s[6:7], s[8:9] -; GFX1010-NEXT: s_mov_b32 s2, 0 -; GFX1010-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1010-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] +; GFX1010-NEXT: s_mov_b32 s4, 0 +; GFX1010-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1010-NEXT: ; %bb.1: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX1010-NEXT: s_sub_u32 s3, 0, s8 -; GFX1010-NEXT: s_subb_u32 s10, 0, s9 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX1010-NEXT: s_sub_u32 s5, 0, s2 +; GFX1010-NEXT: s_subb_u32 s6, 0, s3 ; GFX1010-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1010-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2270,114 +2268,114 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1010-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s3, s0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s1 -; GFX1010-NEXT: s_mul_i32 s12, s10, s1 -; GFX1010-NEXT: s_add_i32 s11, s13, s11 -; GFX1010-NEXT: s_mul_i32 s14, s3, s1 -; GFX1010-NEXT: s_add_i32 s11, s11, s12 +; GFX1010-NEXT: s_mul_i32 s7, s5, s0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s1 +; GFX1010-NEXT: s_mul_i32 s12, s6, s1 +; GFX1010-NEXT: s_add_i32 s7, s13, s7 +; GFX1010-NEXT: s_mul_i32 s14, s5, s1 +; GFX1010-NEXT: s_add_i32 s7, s7, s12 ; GFX1010-NEXT: s_mul_hi_u32 s13, s1, s14 ; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14 ; GFX1010-NEXT: s_mul_i32 s12, s0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX1010-NEXT: s_mul_i32 s1, s1, s11 -; GFX1010-NEXT: s_mul_hi_u32 s16, s0, s11 +; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s7 +; GFX1010-NEXT: s_mul_i32 s1, s1, s7 +; GFX1010-NEXT: s_mul_hi_u32 s16, s0, s7 ; GFX1010-NEXT: s_add_u32 s1, s13, s1 ; GFX1010-NEXT: s_addc_u32 s13, 0, s14 ; GFX1010-NEXT: s_add_u32 s1, s1, s12 -; GFX1010-NEXT: s_mul_i32 s11, s0, s11 +; GFX1010-NEXT: s_mul_i32 s7, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s13, s15 ; GFX1010-NEXT: s_addc_u32 s12, s16, 0 -; GFX1010-NEXT: s_add_u32 s1, s1, s11 -; GFX1010-NEXT: s_addc_u32 s11, 0, s12 +; GFX1010-NEXT: s_add_u32 s1, s1, s7 +; GFX1010-NEXT: s_addc_u32 s7, 0, s12 ; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1010-NEXT: s_addc_u32 s0, s0, s11 +; GFX1010-NEXT: s_addc_u32 s0, s0, s7 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s3, s0 -; GFX1010-NEXT: s_mul_hi_u32 s12, s3, s1 -; GFX1010-NEXT: s_mul_i32 s10, s10, s1 -; GFX1010-NEXT: s_add_i32 s11, s12, s11 -; GFX1010-NEXT: s_mul_i32 s3, s3, s1 -; GFX1010-NEXT: s_add_i32 s11, s11, s10 -; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s3 -; GFX1010-NEXT: s_mul_i32 s13, s0, s3 -; GFX1010-NEXT: s_mul_hi_u32 s3, s1, s3 -; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX1010-NEXT: s_mul_i32 s1, s1, s11 -; GFX1010-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1010-NEXT: s_add_u32 s1, s3, s1 -; GFX1010-NEXT: s_addc_u32 s3, 0, s14 +; GFX1010-NEXT: s_mul_i32 s7, s5, s0 +; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s1 +; GFX1010-NEXT: s_mul_i32 s6, s6, s1 +; GFX1010-NEXT: s_add_i32 s7, s12, s7 +; GFX1010-NEXT: s_mul_i32 s5, s5, s1 +; GFX1010-NEXT: s_add_i32 s7, s7, s6 +; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s5 +; GFX1010-NEXT: s_mul_i32 s13, s0, s5 +; GFX1010-NEXT: s_mul_hi_u32 s5, s1, s5 +; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s7 +; GFX1010-NEXT: s_mul_i32 s1, s1, s7 +; GFX1010-NEXT: s_mul_hi_u32 s6, s0, s7 +; GFX1010-NEXT: s_add_u32 s1, s5, s1 +; GFX1010-NEXT: s_addc_u32 s5, 0, s14 ; GFX1010-NEXT: s_add_u32 s1, s1, s13 -; GFX1010-NEXT: s_mul_i32 s11, s0, s11 -; GFX1010-NEXT: s_addc_u32 s1, s3, s12 -; GFX1010-NEXT: s_addc_u32 s3, s10, 0 -; GFX1010-NEXT: s_add_u32 s1, s1, s11 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 +; GFX1010-NEXT: s_mul_i32 s7, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s5, s12 +; GFX1010-NEXT: s_addc_u32 s5, s6, 0 +; GFX1010-NEXT: s_add_u32 s1, s1, s7 +; GFX1010-NEXT: s_addc_u32 s5, 0, s5 ; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1010-NEXT: s_addc_u32 s0, s0, s3 +; GFX1010-NEXT: s_addc_u32 s0, s0, s5 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s10, s6, s0 -; GFX1010-NEXT: s_mul_hi_u32 s3, s6, s0 -; GFX1010-NEXT: s_mul_hi_u32 s11, s7, s0 -; GFX1010-NEXT: s_mul_i32 s0, s7, s0 -; GFX1010-NEXT: s_mul_hi_u32 s12, s6, s1 -; GFX1010-NEXT: s_mul_hi_u32 s13, s7, s1 -; GFX1010-NEXT: s_mul_i32 s1, s7, s1 -; GFX1010-NEXT: s_add_u32 s10, s12, s10 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 -; GFX1010-NEXT: s_add_u32 s1, s10, s1 -; GFX1010-NEXT: s_addc_u32 s1, s3, s13 -; GFX1010-NEXT: s_addc_u32 s3, s11, 0 +; GFX1010-NEXT: s_mul_i32 s6, s10, s0 +; GFX1010-NEXT: s_mul_hi_u32 s5, s10, s0 +; GFX1010-NEXT: s_mul_hi_u32 s7, s11, s0 +; GFX1010-NEXT: s_mul_i32 s0, s11, s0 +; GFX1010-NEXT: s_mul_hi_u32 s12, s10, s1 +; GFX1010-NEXT: s_mul_hi_u32 s13, s11, s1 +; GFX1010-NEXT: s_mul_i32 s1, s11, s1 +; GFX1010-NEXT: s_add_u32 s6, s12, s6 +; GFX1010-NEXT: s_addc_u32 s5, 0, s5 +; GFX1010-NEXT: s_add_u32 s1, s6, s1 +; GFX1010-NEXT: s_addc_u32 s1, s5, s13 +; GFX1010-NEXT: s_addc_u32 s5, s7, 0 ; GFX1010-NEXT: s_add_u32 s1, s1, s0 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 -; GFX1010-NEXT: s_mul_hi_u32 s0, s8, s1 -; GFX1010-NEXT: s_mul_i32 s11, s8, s3 -; GFX1010-NEXT: s_mul_i32 s12, s8, s1 -; GFX1010-NEXT: s_add_i32 s0, s0, s11 -; GFX1010-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1010-NEXT: s_mul_i32 s10, s9, s1 -; GFX1010-NEXT: s_add_i32 s0, s0, s10 -; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s8 -; GFX1010-NEXT: s_sub_i32 s10, s7, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_subb_u32 s10, s10, s9 +; GFX1010-NEXT: s_addc_u32 s5, 0, s5 +; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1 +; GFX1010-NEXT: s_mul_i32 s7, s2, s5 +; GFX1010-NEXT: s_mul_i32 s12, s2, s1 +; GFX1010-NEXT: s_add_i32 s0, s0, s7 +; GFX1010-NEXT: v_sub_co_u32 v0, s7, s10, s12 +; GFX1010-NEXT: s_mul_i32 s6, s3, s1 +; GFX1010-NEXT: s_add_i32 s0, s0, s6 +; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2 +; GFX1010-NEXT: s_sub_i32 s6, s11, s0 +; GFX1010-NEXT: s_cmp_lg_u32 s7, 0 +; GFX1010-NEXT: s_subb_u32 s6, s6, s3 ; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v1 -; GFX1010-NEXT: s_subb_u32 s10, s10, 0 -; GFX1010-NEXT: s_cmp_ge_u32 s10, s9 +; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 +; GFX1010-NEXT: s_subb_u32 s6, s6, 0 +; GFX1010-NEXT: s_cmp_ge_u32 s6, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s10, s9 +; GFX1010-NEXT: s_cmp_eq_u32 s6, s3 ; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1010-NEXT: s_add_u32 s10, s1, 1 +; GFX1010-NEXT: s_add_u32 s6, s1, 1 ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1010-NEXT: s_addc_u32 s12, s3, 0 +; GFX1010-NEXT: s_addc_u32 s12, s5, 0 ; GFX1010-NEXT: s_add_u32 s13, s1, 2 -; GFX1010-NEXT: s_addc_u32 s14, s3, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0 -; GFX1010-NEXT: s_subb_u32 s0, s7, s0 +; GFX1010-NEXT: s_addc_u32 s14, s5, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s7, 0 +; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 +; GFX1010-NEXT: s_subb_u32 s0, s11, s0 ; GFX1010-NEXT: v_mov_b32_e32 v2, s13 -; GFX1010-NEXT: s_cmp_ge_u32 s0, s9 +; GFX1010-NEXT: s_cmp_ge_u32 s0, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s0, s9 +; GFX1010-NEXT: s_cmp_eq_u32 s0, s3 ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s14 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX1010-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s2 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX1010-NEXT: s_sub_i32 s1, 0, s8 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1010-NEXT: s_sub_i32 s1, 0, s2 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2385,23 +2383,23 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_mul_i32 s1, s1, s0 ; GFX1010-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s1 -; GFX1010-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX1010-NEXT: s_mul_i32 s1, s0, s8 -; GFX1010-NEXT: s_add_i32 s2, s0, 1 -; GFX1010-NEXT: s_sub_i32 s1, s6, s1 -; GFX1010-NEXT: s_sub_i32 s3, s1, s8 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 -; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 -; GFX1010-NEXT: s_cselect_b32 s1, s3, s1 -; GFX1010-NEXT: s_add_i32 s2, s0, 1 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 +; GFX1010-NEXT: s_mul_hi_u32 s0, s10, s0 +; GFX1010-NEXT: s_mul_i32 s1, s0, s2 +; GFX1010-NEXT: s_add_i32 s3, s0, 1 +; GFX1010-NEXT: s_sub_i32 s1, s10, s1 +; GFX1010-NEXT: s_sub_i32 s4, s1, s2 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 +; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 +; GFX1010-NEXT: s_cselect_b32 s1, s4, s1 +; GFX1010-NEXT: s_add_i32 s3, s0, 1 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 ; GFX1010-NEXT: s_mov_b32 s1, 0 -; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: .LBB16_3: ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX1010-NEXT: s_endpgm ; GFX1010-NEXT: .LBB16_4: ; GFX1010-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2410,18 +2408,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-LABEL: sudiv64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] -; GFX1030W32-NEXT: s_mov_b32 s8, 0 -; GFX1030W32-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W32-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] +; GFX1030W32-NEXT: s_mov_b32 s4, 0 +; GFX1030W32-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W32-NEXT: ; %bb.1: ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX1030W32-NEXT: s_sub_u32 s9, 0, s2 -; GFX1030W32-NEXT: s_subb_u32 s10, 0, s3 +; GFX1030W32-NEXT: s_sub_u32 s5, 0, s2 +; GFX1030W32-NEXT: s_subb_u32 s6, 0, s3 ; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W32-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2432,95 +2430,95 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s1 -; GFX1030W32-NEXT: s_mul_i32 s12, s10, s1 -; GFX1030W32-NEXT: s_add_i32 s11, s13, s11 -; GFX1030W32-NEXT: s_mul_i32 s14, s9, s1 -; GFX1030W32-NEXT: s_add_i32 s11, s11, s12 +; GFX1030W32-NEXT: s_mul_i32 s7, s5, s0 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s5, s1 +; GFX1030W32-NEXT: s_mul_i32 s12, s6, s1 +; GFX1030W32-NEXT: s_add_i32 s7, s13, s7 +; GFX1030W32-NEXT: s_mul_i32 s14, s5, s1 +; GFX1030W32-NEXT: s_add_i32 s7, s7, s12 ; GFX1030W32-NEXT: s_mul_hi_u32 s13, s1, s14 ; GFX1030W32-NEXT: s_mul_hi_u32 s15, s0, s14 ; GFX1030W32-NEXT: s_mul_i32 s12, s0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX1030W32-NEXT: s_mul_i32 s1, s1, s11 -; GFX1030W32-NEXT: s_mul_hi_u32 s16, s0, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s7 +; GFX1030W32-NEXT: s_mul_i32 s1, s1, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s16, s0, s7 ; GFX1030W32-NEXT: s_add_u32 s1, s13, s1 ; GFX1030W32-NEXT: s_addc_u32 s13, 0, s14 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s12 -; GFX1030W32-NEXT: s_mul_i32 s11, s0, s11 +; GFX1030W32-NEXT: s_mul_i32 s7, s0, s7 ; GFX1030W32-NEXT: s_addc_u32 s1, s13, s15 ; GFX1030W32-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s11 -; GFX1030W32-NEXT: s_addc_u32 s11, 0, s12 +; GFX1030W32-NEXT: s_add_u32 s1, s1, s7 +; GFX1030W32-NEXT: s_addc_u32 s7, 0, s12 ; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1030W32-NEXT: s_addc_u32 s0, s0, s11 +; GFX1030W32-NEXT: s_addc_u32 s0, s0, s7 ; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s9, s1 -; GFX1030W32-NEXT: s_mul_i32 s10, s10, s1 -; GFX1030W32-NEXT: s_add_i32 s11, s12, s11 -; GFX1030W32-NEXT: s_mul_i32 s9, s9, s1 -; GFX1030W32-NEXT: s_add_i32 s11, s11, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s0, s9 -; GFX1030W32-NEXT: s_mul_i32 s13, s0, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s1, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX1030W32-NEXT: s_mul_i32 s1, s1, s11 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1030W32-NEXT: s_add_u32 s1, s9, s1 -; GFX1030W32-NEXT: s_addc_u32 s9, 0, s14 +; GFX1030W32-NEXT: s_mul_i32 s7, s5, s0 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s5, s1 +; GFX1030W32-NEXT: s_mul_i32 s6, s6, s1 +; GFX1030W32-NEXT: s_add_i32 s7, s12, s7 +; GFX1030W32-NEXT: s_mul_i32 s5, s5, s1 +; GFX1030W32-NEXT: s_add_i32 s7, s7, s6 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s0, s5 +; GFX1030W32-NEXT: s_mul_i32 s13, s0, s5 +; GFX1030W32-NEXT: s_mul_hi_u32 s5, s1, s5 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s7 +; GFX1030W32-NEXT: s_mul_i32 s1, s1, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s6, s0, s7 +; GFX1030W32-NEXT: s_add_u32 s1, s5, s1 +; GFX1030W32-NEXT: s_addc_u32 s5, 0, s14 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s13 -; GFX1030W32-NEXT: s_mul_i32 s11, s0, s11 -; GFX1030W32-NEXT: s_addc_u32 s1, s9, s12 -; GFX1030W32-NEXT: s_addc_u32 s9, s10, 0 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s11 -; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9 +; GFX1030W32-NEXT: s_mul_i32 s7, s0, s7 +; GFX1030W32-NEXT: s_addc_u32 s1, s5, s12 +; GFX1030W32-NEXT: s_addc_u32 s5, s6, 0 +; GFX1030W32-NEXT: s_add_u32 s1, s1, s7 +; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 ; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1030W32-NEXT: s_addc_u32 s0, s0, s9 +; GFX1030W32-NEXT: s_addc_u32 s0, s0, s5 ; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W32-NEXT: s_mul_i32 s10, s6, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s6, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s0 -; GFX1030W32-NEXT: s_mul_i32 s0, s7, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s6, s1 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s1 -; GFX1030W32-NEXT: s_mul_i32 s1, s7, s1 -; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 -; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9 -; GFX1030W32-NEXT: s_add_u32 s1, s10, s1 -; GFX1030W32-NEXT: s_addc_u32 s1, s9, s13 -; GFX1030W32-NEXT: s_addc_u32 s9, s11, 0 +; GFX1030W32-NEXT: s_mul_i32 s6, s10, s0 +; GFX1030W32-NEXT: s_mul_hi_u32 s5, s10, s0 +; GFX1030W32-NEXT: s_mul_hi_u32 s7, s11, s0 +; GFX1030W32-NEXT: s_mul_i32 s0, s11, s0 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s10, s1 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s11, s1 +; GFX1030W32-NEXT: s_mul_i32 s1, s11, s1 +; GFX1030W32-NEXT: s_add_u32 s6, s12, s6 +; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 +; GFX1030W32-NEXT: s_add_u32 s1, s6, s1 +; GFX1030W32-NEXT: s_addc_u32 s1, s5, s13 +; GFX1030W32-NEXT: s_addc_u32 s5, s7, 0 ; GFX1030W32-NEXT: s_add_u32 s1, s1, s0 -; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9 +; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 ; GFX1030W32-NEXT: s_mul_hi_u32 s0, s2, s1 -; GFX1030W32-NEXT: s_mul_i32 s11, s2, s9 +; GFX1030W32-NEXT: s_mul_i32 s7, s2, s5 ; GFX1030W32-NEXT: s_mul_i32 s12, s2, s1 -; GFX1030W32-NEXT: s_add_i32 s0, s0, s11 -; GFX1030W32-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1030W32-NEXT: s_mul_i32 s10, s3, s1 -; GFX1030W32-NEXT: s_add_i32 s0, s0, s10 +; GFX1030W32-NEXT: s_add_i32 s0, s0, s7 +; GFX1030W32-NEXT: v_sub_co_u32 v0, s7, s10, s12 +; GFX1030W32-NEXT: s_mul_i32 s6, s3, s1 +; GFX1030W32-NEXT: s_add_i32 s0, s0, s6 ; GFX1030W32-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX1030W32-NEXT: s_sub_i32 s10, s7, s0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1030W32-NEXT: s_subb_u32 s10, s10, s3 +; GFX1030W32-NEXT: s_sub_i32 s6, s11, s0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s7, 0 +; GFX1030W32-NEXT: s_subb_u32 s6, s6, s3 ; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX1030W32-NEXT: s_subb_u32 s10, s10, 0 -; GFX1030W32-NEXT: s_cmp_ge_u32 s10, s3 +; GFX1030W32-NEXT: s_subb_u32 s6, s6, 0 +; GFX1030W32-NEXT: s_cmp_ge_u32 s6, s3 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1030W32-NEXT: s_cmp_eq_u32 s10, s3 +; GFX1030W32-NEXT: s_cmp_eq_u32 s6, s3 ; GFX1030W32-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1030W32-NEXT: s_add_u32 s10, s1, 1 +; GFX1030W32-NEXT: s_add_u32 s6, s1, 1 ; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1030W32-NEXT: s_addc_u32 s12, s9, 0 +; GFX1030W32-NEXT: s_addc_u32 s12, s5, 0 ; GFX1030W32-NEXT: s_add_u32 s13, s1, 2 -; GFX1030W32-NEXT: s_addc_u32 s14, s9, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1030W32-NEXT: s_addc_u32 s14, s5, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s7, 0 ; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX1030W32-NEXT: s_subb_u32 s0, s7, s0 +; GFX1030W32-NEXT: s_subb_u32 s0, s11, s0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, s13 ; GFX1030W32-NEXT: s_cmp_ge_u32 s0, s3 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo @@ -2530,12 +2528,12 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s14 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo +; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo ; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo +; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo ; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1030W32-NEXT: .LBB16_2: ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2547,14 +2545,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_mul_i32 s1, s1, s0 ; GFX1030W32-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W32-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX1030W32-NEXT: s_mul_hi_u32 s0, s10, s0 ; GFX1030W32-NEXT: s_mul_i32 s1, s0, s2 ; GFX1030W32-NEXT: s_add_i32 s3, s0, 1 -; GFX1030W32-NEXT: s_sub_i32 s1, s6, s1 -; GFX1030W32-NEXT: s_sub_i32 s6, s1, s2 +; GFX1030W32-NEXT: s_sub_i32 s1, s10, s1 +; GFX1030W32-NEXT: s_sub_i32 s4, s1, s2 ; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2 ; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1030W32-NEXT: s_cselect_b32 s1, s6, s1 +; GFX1030W32-NEXT: s_cselect_b32 s1, s4, s1 ; GFX1030W32-NEXT: s_add_i32 s3, s0, 1 ; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2 ; GFX1030W32-NEXT: s_mov_b32 s1, 0 @@ -2563,7 +2561,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030W32-NEXT: .LBB16_3: ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX1030W32-NEXT: s_endpgm ; GFX1030W32-NEXT: .LBB16_4: ; GFX1030W32-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2572,18 +2570,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-LABEL: sudiv64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] +; GFX1030W64-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] ; GFX1030W64-NEXT: s_mov_b32 s0, 0 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX1030W64-NEXT: s_sub_u32 s9, 0, s2 -; GFX1030W64-NEXT: s_subb_u32 s10, 0, s3 +; GFX1030W64-NEXT: s_sub_u32 s5, 0, s2 +; GFX1030W64-NEXT: s_subb_u32 s6, 0, s3 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2592,111 +2590,111 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s0 -; GFX1030W64-NEXT: s_mul_i32 s11, s10, s0 +; GFX1030W64-NEXT: s_mul_i32 s1, s5, s4 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s5, s0 +; GFX1030W64-NEXT: s_mul_i32 s7, s6, s0 ; GFX1030W64-NEXT: s_add_i32 s1, s12, s1 -; GFX1030W64-NEXT: s_mul_i32 s13, s9, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s1, s11 +; GFX1030W64-NEXT: s_mul_i32 s13, s5, s0 +; GFX1030W64-NEXT: s_add_i32 s1, s1, s7 ; GFX1030W64-NEXT: s_mul_hi_u32 s12, s0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s4, s13 +; GFX1030W64-NEXT: s_mul_i32 s7, s4, s13 ; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1 ; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s15, s8, s1 +; GFX1030W64-NEXT: s_mul_hi_u32 s15, s4, s1 ; GFX1030W64-NEXT: s_add_u32 s0, s12, s0 ; GFX1030W64-NEXT: s_addc_u32 s12, 0, s13 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s11 -; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1 +; GFX1030W64-NEXT: s_add_u32 s0, s0, s7 +; GFX1030W64-NEXT: s_mul_i32 s1, s4, s1 ; GFX1030W64-NEXT: s_addc_u32 s0, s12, s14 -; GFX1030W64-NEXT: s_addc_u32 s11, s15, 0 +; GFX1030W64-NEXT: s_addc_u32 s7, s15, 0 ; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 -; GFX1030W64-NEXT: s_addc_u32 s11, 0, s11 +; GFX1030W64-NEXT: s_addc_u32 s7, 0, s7 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_addc_u32 s4, s4, s7 ; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s9, s0 -; GFX1030W64-NEXT: s_mul_i32 s10, s10, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s11, s1 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s1, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s9 -; GFX1030W64-NEXT: s_mul_i32 s12, s8, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s0, s9 +; GFX1030W64-NEXT: s_mul_i32 s1, s5, s4 +; GFX1030W64-NEXT: s_mul_hi_u32 s7, s5, s0 +; GFX1030W64-NEXT: s_mul_i32 s6, s6, s0 +; GFX1030W64-NEXT: s_add_i32 s1, s7, s1 +; GFX1030W64-NEXT: s_mul_i32 s5, s5, s0 +; GFX1030W64-NEXT: s_add_i32 s1, s1, s6 +; GFX1030W64-NEXT: s_mul_hi_u32 s7, s4, s5 +; GFX1030W64-NEXT: s_mul_i32 s12, s4, s5 +; GFX1030W64-NEXT: s_mul_hi_u32 s5, s0, s5 ; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1 ; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s1 -; GFX1030W64-NEXT: s_add_u32 s0, s9, s0 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s1 +; GFX1030W64-NEXT: s_add_u32 s0, s5, s0 +; GFX1030W64-NEXT: s_addc_u32 s5, 0, s13 ; GFX1030W64-NEXT: s_add_u32 s0, s0, s12 -; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1 -; GFX1030W64-NEXT: s_addc_u32 s0, s9, s11 -; GFX1030W64-NEXT: s_addc_u32 s9, s10, 0 +; GFX1030W64-NEXT: s_mul_i32 s1, s4, s1 +; GFX1030W64-NEXT: s_addc_u32 s0, s5, s7 +; GFX1030W64-NEXT: s_addc_u32 s5, s6, 0 ; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 +; GFX1030W64-NEXT: s_addc_u32 s5, 0, s5 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_addc_u32 s0, s8, s9 +; GFX1030W64-NEXT: s_addc_u32 s0, s4, s5 ; GFX1030W64-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W64-NEXT: s_mul_i32 s9, s6, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s8, s6, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s7, s0 -; GFX1030W64-NEXT: s_mul_i32 s0, s7, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s1 -; GFX1030W64-NEXT: s_mul_i32 s1, s7, s1 -; GFX1030W64-NEXT: s_add_u32 s9, s11, s9 -; GFX1030W64-NEXT: s_addc_u32 s8, 0, s8 -; GFX1030W64-NEXT: s_add_u32 s1, s9, s1 -; GFX1030W64-NEXT: s_addc_u32 s1, s8, s12 -; GFX1030W64-NEXT: s_addc_u32 s8, s10, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s1, s0 -; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s0, s2, s10 -; GFX1030W64-NEXT: s_mul_i32 s1, s2, s11 -; GFX1030W64-NEXT: s_mul_i32 s9, s2, s10 +; GFX1030W64-NEXT: s_mul_i32 s5, s10, s0 +; GFX1030W64-NEXT: s_mul_hi_u32 s4, s10, s0 +; GFX1030W64-NEXT: s_mul_hi_u32 s6, s11, s0 +; GFX1030W64-NEXT: s_mul_i32 s0, s11, s0 +; GFX1030W64-NEXT: s_mul_hi_u32 s7, s10, s1 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s11, s1 +; GFX1030W64-NEXT: s_mul_i32 s1, s11, s1 +; GFX1030W64-NEXT: s_add_u32 s5, s7, s5 +; GFX1030W64-NEXT: s_addc_u32 s4, 0, s4 +; GFX1030W64-NEXT: s_add_u32 s1, s5, s1 +; GFX1030W64-NEXT: s_addc_u32 s1, s4, s12 +; GFX1030W64-NEXT: s_addc_u32 s4, s6, 0 +; GFX1030W64-NEXT: s_add_u32 s6, s1, s0 +; GFX1030W64-NEXT: s_addc_u32 s7, 0, s4 +; GFX1030W64-NEXT: s_mul_hi_u32 s0, s2, s6 +; GFX1030W64-NEXT: s_mul_i32 s1, s2, s7 +; GFX1030W64-NEXT: s_mul_i32 s5, s2, s6 ; GFX1030W64-NEXT: s_add_i32 s12, s0, s1 -; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9 -; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 -; GFX1030W64-NEXT: s_add_i32 s12, s12, s8 -; GFX1030W64-NEXT: v_sub_co_u32 v1, s[8:9], v0, s2 -; GFX1030W64-NEXT: s_sub_i32 s13, s7, s12 +; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s10, s5 +; GFX1030W64-NEXT: s_mul_i32 s4, s3, s6 +; GFX1030W64-NEXT: s_add_i32 s12, s12, s4 +; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], v0, s2 +; GFX1030W64-NEXT: s_sub_i32 s13, s11, s12 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1030W64-NEXT: s_subb_u32 s13, s13, s3 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GFX1030W64-NEXT: s_subb_u32 s8, s13, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s3 +; GFX1030W64-NEXT: s_subb_u32 s4, s13, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s4, s3 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s3 +; GFX1030W64-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s4, s3 ; GFX1030W64-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1030W64-NEXT: s_add_u32 s8, s10, 1 -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc -; GFX1030W64-NEXT: s_addc_u32 s9, s11, 0 -; GFX1030W64-NEXT: s_add_u32 s13, s10, 2 -; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s4, s6, 1 +; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc +; GFX1030W64-NEXT: s_addc_u32 s5, s7, 0 +; GFX1030W64-NEXT: s_add_u32 s13, s6, 2 +; GFX1030W64-NEXT: s_addc_u32 s14, s7, 0 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX1030W64-NEXT: s_subb_u32 s0, s7, s12 +; GFX1030W64-NEXT: s_subb_u32 s0, s11, s12 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, s13 ; GFX1030W64-NEXT: s_cmp_ge_u32 s0, s3 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030W64-NEXT: s_cselect_b32 s11, -1, 0 ; GFX1030W64-NEXT: s_cmp_eq_u32 s0, s3 ; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX1030W64-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s14 -; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1] -; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc +; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s11, v0, s[0:1] +; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc +; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc ; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc -; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc +; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc +; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2708,14 +2706,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: s_mul_i32 s1, s1, s0 ; GFX1030W64-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX1030W64-NEXT: s_mul_hi_u32 s0, s10, s0 ; GFX1030W64-NEXT: s_mul_i32 s1, s0, s2 ; GFX1030W64-NEXT: s_add_i32 s3, s0, 1 -; GFX1030W64-NEXT: s_sub_i32 s1, s6, s1 -; GFX1030W64-NEXT: s_sub_i32 s6, s1, s2 +; GFX1030W64-NEXT: s_sub_i32 s1, s10, s1 +; GFX1030W64-NEXT: s_sub_i32 s4, s1, s2 ; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2 ; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1030W64-NEXT: s_cselect_b32 s1, s6, s1 +; GFX1030W64-NEXT: s_cselect_b32 s1, s4, s1 ; GFX1030W64-NEXT: s_add_i32 s3, s0, 1 ; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2 ; GFX1030W64-NEXT: s_mov_b32 s1, 0 @@ -2724,7 +2722,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030W64-NEXT: .LBB16_3: ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX1030W64-NEXT: s_endpgm ; GFX1030W64-NEXT: .LBB16_4: ; GFX1030W64-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2733,19 +2731,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-LABEL: sudiv64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] -; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX11-NEXT: s_sub_u32 s9, 0, s2 -; GFX11-NEXT: s_subb_u32 s10, 0, s3 +; GFX11-NEXT: s_sub_u32 s5, 0, s2 +; GFX11-NEXT: s_subb_u32 s6, 0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 @@ -2762,101 +2760,101 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_mul_i32 s11, s9, s0 -; GFX11-NEXT: s_mul_hi_u32 s13, s9, s1 -; GFX11-NEXT: s_mul_i32 s12, s10, s1 -; GFX11-NEXT: s_add_i32 s11, s13, s11 -; GFX11-NEXT: s_mul_i32 s14, s9, s1 -; GFX11-NEXT: s_add_i32 s11, s11, s12 +; GFX11-NEXT: s_mul_i32 s7, s5, s0 +; GFX11-NEXT: s_mul_hi_u32 s13, s5, s1 +; GFX11-NEXT: s_mul_i32 s12, s6, s1 +; GFX11-NEXT: s_add_i32 s7, s13, s7 +; GFX11-NEXT: s_mul_i32 s14, s5, s1 +; GFX11-NEXT: s_add_i32 s7, s7, s12 ; GFX11-NEXT: s_mul_hi_u32 s13, s1, s14 ; GFX11-NEXT: s_mul_hi_u32 s15, s0, s14 ; GFX11-NEXT: s_mul_i32 s12, s0, s14 -; GFX11-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX11-NEXT: s_mul_i32 s1, s1, s11 -; GFX11-NEXT: s_mul_hi_u32 s16, s0, s11 +; GFX11-NEXT: s_mul_hi_u32 s14, s1, s7 +; GFX11-NEXT: s_mul_i32 s1, s1, s7 +; GFX11-NEXT: s_mul_hi_u32 s16, s0, s7 ; GFX11-NEXT: s_add_u32 s1, s13, s1 ; GFX11-NEXT: s_addc_u32 s13, 0, s14 ; GFX11-NEXT: s_add_u32 s1, s1, s12 -; GFX11-NEXT: s_mul_i32 s11, s0, s11 +; GFX11-NEXT: s_mul_i32 s7, s0, s7 ; GFX11-NEXT: s_addc_u32 s1, s13, s15 ; GFX11-NEXT: s_addc_u32 s12, s16, 0 -; GFX11-NEXT: s_add_u32 s1, s1, s11 -; GFX11-NEXT: s_addc_u32 s11, 0, s12 +; GFX11-NEXT: s_add_u32 s1, s1, s7 +; GFX11-NEXT: s_addc_u32 s7, 0, s12 ; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_addc_u32 s0, s0, s11 +; GFX11-NEXT: s_addc_u32 s0, s0, s7 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_mul_i32 s11, s9, s0 +; GFX11-NEXT: s_mul_i32 s7, s5, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_mul_hi_u32 s12, s9, s1 -; GFX11-NEXT: s_mul_i32 s10, s10, s1 -; GFX11-NEXT: s_add_i32 s11, s12, s11 -; GFX11-NEXT: s_mul_i32 s9, s9, s1 -; GFX11-NEXT: s_add_i32 s11, s11, s10 -; GFX11-NEXT: s_mul_hi_u32 s12, s0, s9 -; GFX11-NEXT: s_mul_i32 s13, s0, s9 -; GFX11-NEXT: s_mul_hi_u32 s9, s1, s9 -; GFX11-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX11-NEXT: s_mul_i32 s1, s1, s11 -; GFX11-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX11-NEXT: s_add_u32 s1, s9, s1 -; GFX11-NEXT: s_addc_u32 s9, 0, s14 +; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1 +; GFX11-NEXT: s_mul_i32 s6, s6, s1 +; GFX11-NEXT: s_add_i32 s7, s12, s7 +; GFX11-NEXT: s_mul_i32 s5, s5, s1 +; GFX11-NEXT: s_add_i32 s7, s7, s6 +; GFX11-NEXT: s_mul_hi_u32 s12, s0, s5 +; GFX11-NEXT: s_mul_i32 s13, s0, s5 +; GFX11-NEXT: s_mul_hi_u32 s5, s1, s5 +; GFX11-NEXT: s_mul_hi_u32 s14, s1, s7 +; GFX11-NEXT: s_mul_i32 s1, s1, s7 +; GFX11-NEXT: s_mul_hi_u32 s6, s0, s7 +; GFX11-NEXT: s_add_u32 s1, s5, s1 +; GFX11-NEXT: s_addc_u32 s5, 0, s14 ; GFX11-NEXT: s_add_u32 s1, s1, s13 -; GFX11-NEXT: s_mul_i32 s11, s0, s11 -; GFX11-NEXT: s_addc_u32 s1, s9, s12 -; GFX11-NEXT: s_addc_u32 s9, s10, 0 -; GFX11-NEXT: s_add_u32 s1, s1, s11 -; GFX11-NEXT: s_addc_u32 s9, 0, s9 +; GFX11-NEXT: s_mul_i32 s7, s0, s7 +; GFX11-NEXT: s_addc_u32 s1, s5, s12 +; GFX11-NEXT: s_addc_u32 s5, s6, 0 +; GFX11-NEXT: s_add_u32 s1, s1, s7 +; GFX11-NEXT: s_addc_u32 s5, 0, s5 ; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_addc_u32 s0, s0, s9 +; GFX11-NEXT: s_addc_u32 s0, s0, s5 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_mul_i32 s10, s6, s0 -; GFX11-NEXT: s_mul_hi_u32 s9, s6, s0 -; GFX11-NEXT: s_mul_hi_u32 s11, s7, s0 -; GFX11-NEXT: s_mul_i32 s0, s7, s0 -; GFX11-NEXT: s_mul_hi_u32 s12, s6, s1 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s1 -; GFX11-NEXT: s_mul_i32 s1, s7, s1 -; GFX11-NEXT: s_add_u32 s10, s12, s10 -; GFX11-NEXT: s_addc_u32 s9, 0, s9 -; GFX11-NEXT: s_add_u32 s1, s10, s1 -; GFX11-NEXT: s_addc_u32 s1, s9, s13 -; GFX11-NEXT: s_addc_u32 s9, s11, 0 +; GFX11-NEXT: s_mul_i32 s6, s10, s0 +; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0 +; GFX11-NEXT: s_mul_hi_u32 s7, s11, s0 +; GFX11-NEXT: s_mul_i32 s0, s11, s0 +; GFX11-NEXT: s_mul_hi_u32 s12, s10, s1 +; GFX11-NEXT: s_mul_hi_u32 s13, s11, s1 +; GFX11-NEXT: s_mul_i32 s1, s11, s1 +; GFX11-NEXT: s_add_u32 s6, s12, s6 +; GFX11-NEXT: s_addc_u32 s5, 0, s5 +; GFX11-NEXT: s_add_u32 s1, s6, s1 +; GFX11-NEXT: s_addc_u32 s1, s5, s13 +; GFX11-NEXT: s_addc_u32 s5, s7, 0 ; GFX11-NEXT: s_add_u32 s1, s1, s0 -; GFX11-NEXT: s_addc_u32 s9, 0, s9 +; GFX11-NEXT: s_addc_u32 s5, 0, s5 ; GFX11-NEXT: s_mul_hi_u32 s0, s2, s1 -; GFX11-NEXT: s_mul_i32 s11, s2, s9 +; GFX11-NEXT: s_mul_i32 s7, s2, s5 ; GFX11-NEXT: s_mul_i32 s12, s2, s1 -; GFX11-NEXT: s_add_i32 s0, s0, s11 -; GFX11-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX11-NEXT: s_mul_i32 s10, s3, s1 +; GFX11-NEXT: s_add_i32 s0, s0, s7 +; GFX11-NEXT: v_sub_co_u32 v0, s7, s10, s12 +; GFX11-NEXT: s_mul_i32 s6, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s0, s0, s10 +; GFX11-NEXT: s_add_i32 s0, s0, s6 ; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX11-NEXT: s_sub_i32 s10, s7, s0 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_subb_u32 s10, s10, s3 +; GFX11-NEXT: s_sub_i32 s6, s11, s0 +; GFX11-NEXT: s_cmp_lg_u32 s7, 0 +; GFX11-NEXT: s_subb_u32 s6, s6, s3 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX11-NEXT: s_subb_u32 s10, s10, 0 +; GFX11-NEXT: s_subb_u32 s6, s6, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s10, s3 +; GFX11-NEXT: s_cmp_ge_u32 s6, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: s_cselect_b32 s12, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, s3 +; GFX11-NEXT: s_cmp_eq_u32 s6, s3 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_add_u32 s10, s1, 1 +; GFX11-NEXT: s_add_u32 s6, s1, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX11-NEXT: s_addc_u32 s12, s9, 0 +; GFX11-NEXT: s_addc_u32 s12, s5, 0 ; GFX11-NEXT: s_add_u32 s13, s1, 2 -; GFX11-NEXT: s_addc_u32 s14, s9, 0 +; GFX11-NEXT: s_addc_u32 s14, s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, s13 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_cmp_lg_u32 s7, 0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX11-NEXT: s_subb_u32 s0, s7, s0 +; GFX11-NEXT: s_subb_u32 s0, s11, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_ge_u32 s0, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo @@ -2866,14 +2864,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, s14 ; GFX11-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX11-NEXT: .LBB16_2: ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2890,15 +2888,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX11-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX11-NEXT: s_mul_hi_u32 s0, s10, s0 ; GFX11-NEXT: s_mul_i32 s1, s0, s2 ; GFX11-NEXT: s_add_i32 s3, s0, 1 -; GFX11-NEXT: s_sub_i32 s1, s6, s1 +; GFX11-NEXT: s_sub_i32 s1, s10, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s1, s2 +; GFX11-NEXT: s_sub_i32 s4, s1, s2 ; GFX11-NEXT: s_cmp_ge_u32 s1, s2 ; GFX11-NEXT: s_cselect_b32 s0, s3, s0 -; GFX11-NEXT: s_cselect_b32 s1, s6, s1 +; GFX11-NEXT: s_cselect_b32 s1, s4, s1 ; GFX11-NEXT: s_add_i32 s3, s0, 1 ; GFX11-NEXT: s_cmp_ge_u32 s1, s2 ; GFX11-NEXT: s_mov_b32 s1, 0 @@ -2907,7 +2905,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: .LBB16_3: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[8:9] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB16_4: ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index 8a39a52cd25eab..c511f88aeaf86c 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s17 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; ; GFX900-LABEL: test_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX1010-LABEL: test_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -67,76 +67,80 @@ entry: define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s17 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: s_mov_b32 s12, s14 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_mov_b32 s32, 0 -; GFX803-NEXT: s_getpc_b64 s[16:17] -; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_getpc_b64 s[16:17] -; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_kern_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_add_u32 s12, s12, s17 ; GFX1010-NEXT: s_mov_b32 s32, 0 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: s_mov_b32 s12, s14 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_getpc_b64 s[16:17] -; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; ; GFX1100-LABEL: test_kern_call: ; GFX1100: ; %bb.0: ; %entry ; GFX1100-NEXT: v_mov_b32_e32 v31, v0 ; GFX1100-NEXT: s_mov_b32 s12, s13 -; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 ; GFX1100-NEXT: s_mov_b32 s14, s15 ; GFX1100-NEXT: s_mov_b32 s32, 0 -; GFX1100-NEXT: s_getpc_b64 s[6:7] -; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4 -; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12 -; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm entry: @@ -147,69 +151,72 @@ entry: define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s17 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: s_mov_b32 s12, s14 ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_getpc_b64 s[16:17] -; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_getpc_b64 s[16:17] -; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_kern_stack_and_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_add_u32 s12, s12, s17 ; GFX1010-NEXT: s_movk_i32 s32, 0x200 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: s_mov_b32 s13, s15 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: s_mov_b32 s14, s16 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_getpc_b64 s[16:17] -; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; ; GFX1100-LABEL: test_kern_stack_and_call: @@ -217,18 +224,19 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0 ; GFX1100-NEXT: s_mov_b32 s12, s13 -; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 ; GFX1100-NEXT: s_mov_b32 s14, s15 ; GFX1100-NEXT: s_mov_b32 s32, 16 ; GFX1100-NEXT: scratch_store_b32 off, v1, off dlc ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: s_getpc_b64 s[6:7] -; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4 -; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12 -; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm entry: @@ -266,7 +274,7 @@ entry: define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s17 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 @@ -276,7 +284,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; ; GFX900-LABEL: test_force_fp_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 @@ -287,7 +295,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX1010-LABEL: test_force_fp_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 @@ -310,80 +318,84 @@ entry: define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s17 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: s_mov_b32 s12, s14 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_mov_b32 s32, 0 -; GFX803-NEXT: s_getpc_b64 s[16:17] -; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_getpc_b64 s[16:17] -; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_force_fp_kern_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_add_u32 s12, s12, s17 ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_mov_b32 s32, 0 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: s_mov_b32 s12, s14 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_getpc_b64 s[16:17] -; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; ; GFX1100-LABEL: test_force_fp_kern_call: ; GFX1100: ; %bb.0: ; %entry ; GFX1100-NEXT: v_mov_b32_e32 v31, v0 ; GFX1100-NEXT: s_mov_b32 s12, s13 -; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 ; GFX1100-NEXT: s_mov_b32 s14, s15 ; GFX1100-NEXT: s_mov_b32 s33, 0 ; GFX1100-NEXT: s_mov_b32 s32, 0 -; GFX1100-NEXT: s_getpc_b64 s[6:7] -; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4 -; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12 -; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm ; GFX1010-NEXT s_add_u32 s12, s12, s17 ; GFX1010-NEXT s_mov_b32 s33, 0 @@ -412,72 +424,75 @@ entry: define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_stack_and_call: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_i32 s10, s10, s15 -; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s17 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: s_mov_b32 s12, s14 ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_getpc_b64 s[16:17] -; GFX803-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_stack_and_call: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_getpc_b64 s[16:17] -; GFX900-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_force_fp_kern_stack_and_call: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s10, s10, s15 +; GFX1010-NEXT: s_add_u32 s12, s12, s17 ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_movk_i32 s32, 0x200 -; GFX1010-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1010-NEXT: s_mov_b32 s13, s15 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: s_mov_b32 s14, s16 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_getpc_b64 s[16:17] -; GFX1010-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; ; GFX1100-LABEL: test_force_fp_kern_stack_and_call: @@ -486,18 +501,19 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1100-NEXT: v_mov_b32_e32 v31, v0 ; GFX1100-NEXT: s_mov_b32 s33, 0 ; GFX1100-NEXT: s_mov_b32 s12, s13 -; GFX1100-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1100-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 ; GFX1100-NEXT: s_mov_b32 s14, s15 ; GFX1100-NEXT: s_mov_b32 s32, 16 ; GFX1100-NEXT: scratch_store_b32 off, v1, s33 dlc ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: s_getpc_b64 s[6:7] -; GFX1100-NEXT: s_add_u32 s6, s6, ex@rel32@lo+4 -; GFX1100-NEXT: s_addc_u32 s7, s7, ex@rel32@hi+12 -; GFX1100-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) @@ -509,7 +525,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX803-LABEL: test_sgpr_offset_kernel: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s17 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -525,7 +541,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX900-LABEL: test_sgpr_offset_kernel: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -541,7 +557,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX1010-LABEL: test_sgpr_offset_kernel: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 3e25904aa044dd..fa4e82da1d18e7 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -5,12 +5,12 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: test_loop: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0xa +; GCN-NEXT: s_load_dword s0, s[4:5], 0xa ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s0, -1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -34,31 +34,31 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN_DBG-NEXT: s_mov_b32 s14, -1 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 -; GCN_DBG-NEXT: s_load_dword s1, s[2:3], 0xa +; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s2, -1 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec +; GCN_DBG-NEXT: s_mov_b64 s[6:7], exec ; GCN_DBG-NEXT: s_mov_b64 exec, -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) ; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 @@ -81,9 +81,9 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock ; GCN_DBG-NEXT: s_endpgm @@ -108,7 +108,7 @@ for.body: define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_true: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -132,26 +132,26 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN_DBG-NEXT: s_mov_b32 s14, -1 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) ; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 @@ -174,9 +174,9 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -199,7 +199,7 @@ for.body: define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_false: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -215,26 +215,26 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN_DBG-NEXT: s_mov_b32 s14, -1 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) ; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 @@ -257,9 +257,9 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -283,7 +283,7 @@ for.body: define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -299,26 +299,26 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN_DBG-NEXT: s_mov_b32 s14, -1 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: s_waitcnt expcnt(0) ; GCN_DBG-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 @@ -339,9 +339,9 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -368,7 +368,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v0, v0 -; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_bitcmp1_b32 s0, 0 @@ -395,9 +395,9 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN_DBG-NEXT: s_mov_b32 s14, -1 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s9 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index b23249570faa7d..b44f8ca87fd8be 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s @@ -25,7 +24,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 7587b81e9936da..97f50ead7de627 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 6505e390355a8c..f6611c6160fd15 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -119,7 +119,7 @@ ret: ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[2:3], 0x2c +; VI: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x2c ; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 ; GCN: s_cbranch_scc{{[0-1]}} diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 78f61ad906ce3c..21a2ae80574e0f 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -425,9 +425,9 @@ bb: define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 { ; GFX900-LABEL: vload2_private: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s17 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] @@ -456,10 +456,10 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR-LABEL: vload2_private: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_mov_b32 s4, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] @@ -489,9 +489,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX10_DEFAULT-LABEL: vload2_private: ; GFX10_DEFAULT: ; %bb.0: ; %entry -; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s15 +; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s17 ; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0 ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] @@ -520,11 +520,11 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR_GFX10-LABEL: vload2_private: ; FLATSCR_GFX10: ; %bb.0: ; %entry -; FLATSCR_GFX10-NEXT: s_add_u32 s6, s6, s11 -; FLATSCR_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; FLATSCR_GFX10-NEXT: s_add_u32 s8, s8, s13 +; FLATSCR_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR_GFX10-NEXT: s_mov_b32 s4, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -556,7 +556,7 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX11-LABEL: vload2_private: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index b17e1a08074117..6571d515cfef2d 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -40,18 +40,18 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -75,7 +75,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_multi_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -95,7 +95,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -115,21 +115,21 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_max_f32_e64 v2, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_multi_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -157,7 +157,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_dbg_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -173,7 +173,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_dbg_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -190,18 +190,18 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_dbg_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_dbg_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -226,7 +226,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_neg_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_add_neg_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -261,19 +261,19 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_add_neg_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_floor_f32_e32 v1, v1 ; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_neg_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -299,7 +299,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_non_clamp_max_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_non_clamp_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -334,19 +334,19 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_non_clamp_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_non_clamp_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -370,7 +370,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-LABEL: v_clamp_add_src_f32_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -386,7 +386,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_add_src_f32_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -403,18 +403,18 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_add_src_f32_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f32_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -438,7 +438,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -456,7 +456,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_add_src_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -473,18 +473,18 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_add_src_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -508,7 +508,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -526,7 +526,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -543,18 +543,18 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -578,7 +578,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -595,7 +595,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_add_src_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -613,19 +613,19 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_add_src_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp ; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -650,7 +650,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -666,7 +666,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -683,18 +683,18 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -718,70 +718,69 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspace(1) %aptr, float %a) #0 { ; SI-LABEL: v_clamp_mac_to_mad: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s0, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mad_f32 v3, s0, s0, v2 clamp +; SI-NEXT: v_mad_f32 v3, s8, s8, v2 clamp ; SI-NEXT: v_add_f32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: v_clamp_mac_to_mad: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_f32 v2, s0, s0, v3 clamp +; GFX8-NEXT: v_mad_f32 v2, s4, s4, v3 clamp ; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_clamp_mac_to_mad: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_f32 v2, s0, s0, v1 clamp +; GFX9-NEXT: v_mad_f32 v2, s6, s6, v1 clamp ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_mac_to_mad: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: v_mul_f32_e64 v2, s0, s0 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_mul_f32_e64 v2, s4, s4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v2, v2, v1 clamp ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid @@ -800,7 +799,7 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -824,7 +823,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -844,18 +843,18 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -879,7 +878,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_v2f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -903,7 +902,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -923,18 +922,18 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -958,7 +957,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -990,7 +989,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1012,19 +1011,19 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1050,7 +1049,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1075,7 +1074,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,19 +1095,19 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1136,7 +1135,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1161,7 +1160,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1182,19 +1181,19 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1222,7 +1221,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1246,7 +1245,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,19 +1265,19 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1305,7 +1304,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1328,7 +1327,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1348,19 +1347,19 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1387,7 +1386,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_packed_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1412,7 +1411,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_no_clamp_add_packed_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1433,19 +1432,19 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_no_clamp_add_packed_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_no_clamp_add_packed_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1473,7 +1472,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1497,7 +1496,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1515,20 +1514,20 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 311feafe3f43f7..73ed23ab681f00 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -41,18 +41,18 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -65,7 +65,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -89,7 +89,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -105,7 +105,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -122,18 +122,18 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -171,7 +171,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -187,7 +187,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -204,18 +204,18 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -228,7 +228,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -273,7 +273,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -292,20 +292,20 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negzero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -319,7 +319,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -347,7 +347,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -365,7 +365,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -384,20 +384,20 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -411,7 +411,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -436,7 +436,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_multi_use_max_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -457,7 +457,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -478,22 +478,22 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_multi_use_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -511,7 +511,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_multi_use_max_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -542,7 +542,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -559,7 +559,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -576,18 +576,18 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -624,7 +624,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -641,7 +641,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -658,18 +658,18 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -707,7 +707,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -724,7 +724,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -741,18 +741,18 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -765,7 +765,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -792,7 +792,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -808,7 +808,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -825,18 +825,18 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -873,7 +873,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -889,7 +889,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -906,18 +906,18 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -930,7 +930,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -955,7 +955,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -971,7 +971,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -988,18 +988,18 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -1012,7 +1012,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -1039,7 +1039,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1056,7 +1056,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX8-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1074,19 +1074,19 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_brev_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_brev_b32 s2, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, s0, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_med3_f32 v1, s2, 1.0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1099,7 +1099,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1121,7 +1121,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_aby_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1154,18 +1154,18 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_aby_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_aby_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1178,7 +1178,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_aby_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1200,7 +1200,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bay_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bay_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1233,18 +1233,18 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bay_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bay_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bay_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1279,7 +1279,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yab_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1295,7 +1295,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yab_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1312,18 +1312,18 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yab_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yab_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1336,7 +1336,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yab_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1358,7 +1358,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yba_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1374,7 +1374,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yba_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1391,18 +1391,18 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yba_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yba_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1415,7 +1415,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yba_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_ayb_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1453,7 +1453,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_ayb_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1470,18 +1470,18 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_ayb_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_ayb_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_ayb_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1516,7 +1516,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bya_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1532,7 +1532,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bya_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1549,18 +1549,18 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bya_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bya_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bya_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1595,7 +1595,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_one_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX8-LABEL: v_clamp_constants_to_one_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1619,7 +1619,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX9-LABEL: v_clamp_constants_to_one_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1628,7 +1628,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX11-LABEL: v_clamp_constants_to_one_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1638,7 +1638,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX12-LABEL: v_clamp_constants_to_one_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_zero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1666,7 +1666,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constants_to_zero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1678,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constants_to_zero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1687,7 +1687,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constants_to_zero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constants_to_zero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1714,7 +1714,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1726,7 +1726,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constant_preserve_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0.5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1738,7 +1738,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constant_preserve_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,7 +1747,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constant_preserve_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1757,7 +1757,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constant_preserve_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1774,7 +1774,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1786,7 +1786,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 @@ -1817,7 +1817,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 @@ -1834,7 +1834,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_qnan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1845,7 +1845,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX8-LABEL: v_clamp_constant_qnan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1857,7 +1857,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: v_clamp_constant_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1876,7 +1876,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1893,7 +1893,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX8-LABEL: v_clamp_constant_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1916,7 +1916,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: v_clamp_constant_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1925,7 +1925,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1935,7 +1935,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1973,7 +1973,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1991,19 +1991,19 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2017,7 +2017,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2042,7 +2042,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; GFX6-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2058,7 +2058,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2075,18 +2075,18 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2099,7 +2099,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2124,7 +2124,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2141,7 +2141,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2159,19 +2159,19 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2185,7 +2185,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2209,7 +2209,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2244,19 +2244,19 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2270,7 +2270,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2295,7 +2295,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2311,7 +2311,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2328,18 +2328,18 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2352,7 +2352,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2374,7 +2374,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2390,7 +2390,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2407,18 +2407,18 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2431,7 +2431,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2453,7 +2453,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2469,7 +2469,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2486,18 +2486,18 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2510,7 +2510,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2532,7 +2532,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2548,7 +2548,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2565,18 +2565,18 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2589,7 +2589,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2611,7 +2611,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2627,7 +2627,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2644,18 +2644,18 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2668,7 +2668,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2690,7 +2690,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2706,7 +2706,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2723,18 +2723,18 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2747,7 +2747,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2769,7 +2769,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2781,7 +2781,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2793,7 +2793,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2802,7 +2802,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 @@ -2812,7 +2812,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2829,7 +2829,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2841,7 +2841,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2853,7 +2853,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2862,7 +2862,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 @@ -2872,7 +2872,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2889,7 +2889,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2911,7 +2911,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2930,18 +2930,18 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2954,7 +2954,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2978,7 +2978,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_elt: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3004,7 +3004,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_v2f16_undef_elt: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3028,18 +3028,18 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_v2f16_undef_elt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_undef_elt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3052,7 +3052,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_clamp_v2f16_undef_elt: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3076,7 +3076,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_zero: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3100,7 +3100,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: v_clamp_v2f16_not_zero: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3121,20 +3121,20 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_clamp_v2f16_not_zero: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_not_zero: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3150,7 +3150,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX12-LABEL: v_clamp_v2f16_not_zero: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3177,7 +3177,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_one: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3200,7 +3200,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_not_one: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3221,20 +3221,20 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_not_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_not_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3250,7 +3250,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_not_one: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3277,7 +3277,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3300,7 +3300,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_clamp_neg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3319,18 +3319,18 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_clamp_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: v_clamp_neg_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3368,7 +3368,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3391,7 +3391,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_negabs_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3410,19 +3410,19 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_negabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_negabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3436,7 +3436,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_negabs_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3464,7 +3464,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neglo_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3487,7 +3487,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_neglo_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3506,18 +3506,18 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neglo_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neglo_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3530,7 +3530,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neglo_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3557,7 +3557,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neghi_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3579,7 +3579,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_neghi_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3598,18 +3598,18 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neghi_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_neghi_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3622,7 +3622,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neghi_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3649,7 +3649,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_shuffle: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3671,7 +3671,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_shuffle: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3690,18 +3690,18 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_shuffle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_shuffle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3714,7 +3714,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_shuffle: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3739,7 +3739,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3765,7 +3765,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3789,18 +3789,18 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3813,7 +3813,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3837,7 +3837,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3863,7 +3863,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3887,18 +3887,18 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3935,7 +3935,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 ; GFX6-LABEL: v_clamp_diff_source_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_load_dword s2, s[2:3], 0x2 @@ -3952,7 +3952,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_diff_source_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -3971,23 +3971,23 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_diff_source_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_add_f32_e32 v1, s0, v1 -; GFX9-NEXT: v_add_f32_e32 v2, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] offset:12 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_diff_source_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4003,7 +4003,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_clamp_diff_source_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index fb3de211eaeb8e..986dd8a0464244 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -30,7 +30,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s4, 8 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_u32 s4, s0, 8 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_clause 0x3 @@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: flat_load_dword v9, v[2:3] ; GFX10-NEXT: flat_load_dword v10, v[4:5] ; GFX10-NEXT: flat_load_dword v11, v[6:7] -; GFX10-NEXT: s_add_u32 s0, s6, 8 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: s_add_u32 s0, s2, 8 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s6, 16 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: s_add_u32 s2, s6, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: s_addc_u32 s3, s7, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 16 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_add_u32 s2, s2, 24 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 @@ -96,7 +96,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 @@ -155,7 +155,7 @@ bb: define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s4, 8 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 -; GFX10-NEXT: s_add_u32 s0, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_u32 s4, s0, 8 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: flat_load_dword v6, v[2:3] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 @@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[4:5] ; GFX10-NEXT: flat_load_dword v10, v[2:3] -; GFX10-NEXT: s_add_u32 s0, s6, 8 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: s_add_u32 s2, s6, 16 +; GFX10-NEXT: s_add_u32 s0, s2, 8 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_add_u32 s4, s2, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s3, s7, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: s_addc_u32 s5, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s6, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: s_addc_u32 s1, s7, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_u32 s0, s2, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 @@ -223,7 +223,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 3035a8579c8a6d..ee4a2ed883b638 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -12,11 +12,11 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -31,11 +31,11 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index fe17ff169cb14b..b3b26df49910a9 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -10,10 +10,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: simple_nested_if: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -33,7 +33,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: v_mov_b32_e32 v2, 1 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:4 ; GCN-NEXT: .LBB0_3: ; %bb.outer.end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -46,9 +46,9 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-O0-NEXT: s_mov_b32 s14, -1 ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s9 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0 @@ -172,10 +172,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-LABEL: uncollapsable_nested_if: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -186,7 +186,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 ; GCN-NEXT: buffer_store_dword v4, v[3:4], s[0:3], 0 addr64 -; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_3 ; GCN-NEXT: ; %bb.2: ; %bb.inner.then ; GCN-NEXT: s_mov_b32 s0, s2 @@ -194,14 +194,14 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4 ; GCN-NEXT: .LBB1_3: ; %bb.inner.end -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_mov_b32 s0, s2 ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 ; GCN-NEXT: .LBB1_4: ; %Flow -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -215,9 +215,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-O0-NEXT: s_mov_b32 s14, -1 ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s9 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0 @@ -367,7 +367,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_if_else: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -418,9 +418,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-O0-NEXT: s_mov_b32 s14, -1 ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s9 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] ; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane @@ -595,7 +595,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_else_if: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -660,9 +660,9 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-O0-NEXT: s_mov_b32 s14, -1 ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s9 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 ; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 2 @@ -887,7 +887,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %bb.then -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -906,9 +906,9 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-O0-NEXT: s_mov_b32 s14, -1 ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-O0-NEXT: s_add_u32 s12, s12, s9 +; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: v_writelane_b32 v3, s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index b5e0589cf9e466..1778fa42fbf7e9 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -88,7 +88,7 @@ bb: define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: sub1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: sub1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -127,8 +127,8 @@ bb: define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: add_adde: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -144,13 +144,13 @@ define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) { ; ; GFX9-LABEL: add_adde: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v3, vcc ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -171,8 +171,8 @@ bb: define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: adde_add: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -188,13 +188,13 @@ define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) { ; ; GFX9-LABEL: adde_add: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v4, vcc ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -215,8 +215,8 @@ bb: define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -232,13 +232,13 @@ define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) { ; ; GFX9-LABEL: sub_sube: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -259,8 +259,8 @@ bb: define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube_commuted: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -277,15 +277,15 @@ define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i3 ; ; GFX9-LABEL: sub_sube_commuted: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -306,8 +306,8 @@ bb: define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sube_sub: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -323,13 +323,13 @@ define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) { ; ; GFX9-LABEL: sube_sub: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -350,8 +350,8 @@ bb: define amdgpu_kernel void @zext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: zext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -367,13 +367,13 @@ define amdgpu_kernel void @zext_flclass(ptr addrspace(1) nocapture %arg, float % ; ; GFX9-LABEL: zext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 +; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -392,8 +392,8 @@ bb: define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: sext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -409,13 +409,13 @@ define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float % ; ; GFX9-LABEL: sext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 +; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -434,7 +434,7 @@ bb: define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add_and: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_max_u32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 @@ -478,7 +478,7 @@ bb: define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_sext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -493,7 +493,7 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_sext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +518,7 @@ bb: define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_zext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_zext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,8 +557,8 @@ bb: define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_addcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -574,15 +574,15 @@ define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) ; ; GFX9-LABEL: sub_addcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: @@ -601,8 +601,8 @@ bb: define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_subcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -618,13 +618,13 @@ define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) ; ; GFX9-LABEL: sub_subcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -646,7 +646,7 @@ bb: define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[4:5] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -695,7 +695,7 @@ bb: define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[4:5] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index 5fbcd0bf669995..c17cf1cd6bca49 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,12 +5,12 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x2 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.then -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; CHECK-NEXT: s_and_b32 s4, s0, 0xffff ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mul_i32 s6, s4, 0xaaab diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index 1e2a2dcf1e3339..93b5f155fc81e6 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadCombine: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -37,14 +37,14 @@ entry: define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadShuffle: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s0, 0x7050604 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_mov_b32 s0, 0x7050604 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: v_perm_b32 v2, v2, v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 456d0ffd48e7f1..c6c0b9cf8f027f 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -48,30 +48,31 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -81,7 +82,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -103,7 +104,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -128,7 +129,7 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa ; ; VI-LABEL: test_copy_v4i8_x3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -162,41 +163,41 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s18, s6 +; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s19, s7 -; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_mov_b32 s23, s7 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 -; SI-NEXT: s_mov_b32 s16, s12 -; SI-NEXT: s_mov_b32 s17, s13 -; SI-NEXT: s_mov_b32 s20, s14 -; SI-NEXT: s_mov_b32 s21, s15 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s16, s8 +; SI-NEXT: s_mov_b32 s17, s9 +; SI-NEXT: s_mov_b32 s20, s10 +; SI-NEXT: s_mov_b32 s21, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -206,7 +207,7 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 @@ -240,22 +241,22 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 @@ -271,13 +272,13 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -287,7 +288,7 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -324,7 +325,7 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -363,7 +364,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -411,7 +412,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -431,7 +432,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -455,7 +456,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -475,7 +476,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -500,7 +501,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -523,7 +524,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -550,7 +551,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -567,7 +568,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p ; ; VI-LABEL: test_copy_v4i8_volatile_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -589,7 +590,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_store: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -616,7 +617,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ; ; VI-LABEL: test_copy_v4i8_volatile_store: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll index 95d28c9749522d..ac9a2794916684 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -6,46 +6,46 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %e, ptr addrspace(1) %f, ptr addrspace(1) %pout.coerce) { ; RRLIST-LABEL: sccClobber: ; RRLIST: ; %bb.0: ; %entry -; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; RRLIST-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) -; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 -; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0 +; RRLIST-NEXT: s_load_dword s16, s[12:13], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44 +; RRLIST-NEXT: s_load_dword s17, s[14:15], 0x0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) -; RRLIST-NEXT: s_min_i32 s4, s16, 0 +; RRLIST-NEXT: s_min_i32 s8, s16, 0 ; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; RRLIST-NEXT: s_and_b64 s[2:3], vcc, exec -; RRLIST-NEXT: s_cselect_b32 s2, s16, s17 -; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] -; RRLIST-NEXT: s_cselect_b32 s0, s4, s2 +; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; RRLIST-NEXT: s_and_b64 s[4:5], vcc, exec +; RRLIST-NEXT: s_cselect_b32 s4, s16, s17 +; RRLIST-NEXT: s_cmp_eq_u64 s[2:3], s[0:1] +; RRLIST-NEXT: s_cselect_b32 s0, s8, s4 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 -; RRLIST-NEXT: global_store_dword v2, v0, s[14:15] +; RRLIST-NEXT: global_store_dword v2, v0, s[6:7] ; RRLIST-NEXT: s_endpgm ; ; FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry -; FAST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; FAST-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) -; FAST-NEXT: s_load_dword s16, s[8:9], 0x0 -; FAST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; FAST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 -; FAST-NEXT: s_load_dword s17, s[10:11], 0x0 +; FAST-NEXT: s_load_dword s16, s[12:13], 0x0 +; FAST-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 +; FAST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; FAST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44 +; FAST-NEXT: s_load_dword s17, s[14:15], 0x0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) -; FAST-NEXT: s_min_i32 s4, s16, 0 +; FAST-NEXT: s_min_i32 s8, s16, 0 ; FAST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; FAST-NEXT: s_and_b64 s[2:3], vcc, exec -; FAST-NEXT: s_cselect_b32 s2, s16, s17 -; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] -; FAST-NEXT: s_cselect_b32 s0, s4, s2 +; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; FAST-NEXT: s_and_b64 s[4:5], vcc, exec +; FAST-NEXT: s_cselect_b32 s4, s16, s17 +; FAST-NEXT: s_cmp_eq_u64 s[2:3], s[0:1] +; FAST-NEXT: s_cselect_b32 s0, s8, s4 ; FAST-NEXT: v_mov_b32_e32 v0, s0 -; FAST-NEXT: global_store_dword v2, v0, s[14:15] +; FAST-NEXT: global_store_dword v2, v0, s[6:7] ; FAST-NEXT: s_endpgm entry: %i = load i64, ptr addrspace(1) %a, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll index ce181b6223e413..ed0a97c729c9c5 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -4,25 +4,25 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(4) %addrSrc) { ; GCN-LABEL: copy_to_scc: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:252 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], vcc -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, 2, 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: global_store_dword v1, v0, s[4:5] +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s2, 2, 3 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-NEXT: s_endpgm entry: ; preds = %1009 %0 = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 09dc6d6bff9e31..5e6152661aeec4 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -176,28 +176,30 @@ bb1: define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-LABEL: v3i16_registers: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_load_dword s12, s[8:9], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s8, 0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-NEXT: s_bitcmp1_b32 s12, 0 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-NEXT: s_cbranch_vccnz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %if.else -; GCN-NEXT: s_add_u32 s8, s6, 8 +; GCN-NEXT: s_add_u32 s8, s8, 8 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s7, 0 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func_v3i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func_v3i16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB4_3 ; GCN-NEXT: .LBB4_2: ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -225,28 +227,30 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-LABEL: v3f16_registers: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_load_dword s8, s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_load_dword s12, s[8:9], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s8, 0 -; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GCN-NEXT: s_and_b64 vcc, exec, s[8:9] +; GCN-NEXT: s_bitcmp1_b32 s12, 0 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-NEXT: s_cbranch_vccnz .LBB5_2 ; GCN-NEXT: ; %bb.1: ; %if.else -; GCN-NEXT: s_add_u32 s8, s6, 8 +; GCN-NEXT: s_add_u32 s8, s8, 8 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s7, 0 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func_v3f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func_v3f16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_2: ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index a7522ef761b8ab..e1c9fed9df4892 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -23,11 +23,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s4 +; SI-NEXT: s_flbit_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -36,12 +36,12 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s4, s4 +; VI-NEXT: s_flbit_i32_b32 s4, s6 ; VI-NEXT: s_min_u32 s4, s4, 32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -62,11 +62,11 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_ctlz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s2, s4 +; GFX10-NEXT: s_flbit_i32_b32 s2, s2 ; GFX10-NEXT: s_min_u32 s2, s2, 32 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -75,11 +75,11 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-GISEL-LABEL: s_ctlz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s2 ; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -88,10 +88,10 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX11-LABEL: s_ctlz_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s4 +; GFX11-NEXT: s_clz_i32_u32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -105,7 +105,7 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -162,33 +162,33 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -274,37 +274,37 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -330,7 +330,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -356,7 +356,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -408,11 +408,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 @@ -422,16 +422,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -441,12 +441,12 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -477,7 +477,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -497,7 +497,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -547,33 +547,33 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] @@ -593,12 +593,12 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b64 s4, s[4:5] +; SI-NEXT: s_flbit_i32_b64 s4, s[6:7] ; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -607,13 +607,13 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s4, s[4:5] +; VI-NEXT: s_flbit_i32_b64 s4, s[6:7] ; VI-NEXT: s_min_u32 s4, s4, 64 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -640,21 +640,21 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_ctlz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_flbit_i32_b64 s0, s[0:1] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] @@ -662,14 +662,14 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u64 s0, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -685,7 +685,7 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -699,7 +699,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -730,29 +730,29 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX10-NEXT: s_min_u32 s0, s0, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX10-NEXT: s_min_u32 s2, s2, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -769,7 +769,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -790,7 +790,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -838,25 +838,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -864,12 +864,12 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -965,38 +965,38 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1081,34 +1081,34 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1150,7 +1150,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1188,34 +1188,34 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1304,37 +1304,37 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1426,37 +1426,37 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1543,22 +1543,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1569,12 +1569,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] @@ -1595,7 +1595,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1663,25 +1663,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 @@ -1689,12 +1689,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1720,7 +1720,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1739,7 +1739,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1782,23 +1782,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1811,12 +1811,12 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4588bee49f037f..84fcb3718c00ca 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,11 +29,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -41,10 +41,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -64,11 +64,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -154,7 +154,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -318,11 +318,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 24 +; SI-NEXT: s_lshl_b32 s2, s2, 24 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 24 +; VI-NEXT: s_lshl_b32 s2, s2, 24 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -373,11 +373,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 24 +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s2, 24 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] @@ -392,11 +392,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_lshl_b32 s2, s2, 16 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -405,10 +405,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -447,11 +447,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 16 +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] @@ -466,11 +466,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -478,10 +478,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -501,11 +501,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -519,7 +519,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +533,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -561,14 +561,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i64 %val, 0 @@ -580,7 +580,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -601,7 +601,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -650,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone @@ -673,7 +673,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -698,7 +698,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -758,11 +758,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -770,7 +770,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone @@ -783,7 +783,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -814,7 +814,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -874,13 +874,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -891,7 +891,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone @@ -904,7 +904,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -951,7 +951,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1055,17 +1055,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -1099,7 +1099,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1163,11 +1163,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1175,7 +1175,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1188,12 +1188,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b64 s4, s[4:5] +; SI-NEXT: s_flbit_i32_b64 s4, s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1201,8 +1201,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] @@ -1230,15 +1230,15 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[0:1] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) store i64 %ctlz, ptr addrspace(1) %out @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1287,12 +1287,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 @@ -1303,7 +1303,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1369,17 +1369,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1413,7 +1413,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1459,17 +1459,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1503,7 +1503,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1539,16 +1539,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1563,7 +1563,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1582,7 +1582,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1618,16 +1618,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1660,7 +1660,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1702,11 +1702,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1756,7 +1756,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1805,17 +1805,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1835,7 +1835,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1856,7 +1856,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1894,16 +1894,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1919,7 +1919,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1940,7 +1940,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1978,16 +1978,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2003,7 +2003,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2024,7 +2024,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2063,16 +2063,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2088,7 +2088,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2109,7 +2109,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2148,16 +2148,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2202,11 +2202,11 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 14 +; SI-NEXT: s_lshl_b32 s2, s2, 14 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -2219,10 +2219,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 14 +; VI-NEXT: s_lshl_b32 s2, s2, 14 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2276,11 +2276,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 14 +; GFX9-GISEL-NEXT: s_lshl_b32 s2, s2, 14 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff ; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 40929d58834472..17ab8fc780fb41 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -14,12 +14,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s6, 0xffff ; SI-NEXT: s_bcnt1_i32_b32 s4, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -27,12 +27,12 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) ; ; VI-LABEL: s_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s4, s6, 0xffff ; VI-NEXT: s_bcnt1_i32_b32 s4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -142,23 +142,23 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 @@ -166,25 +166,25 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_ushort v1, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_add_chain_i16: @@ -239,39 +239,39 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_add_sgpr_i16: @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1234,39 +1234,39 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_var: @@ -1315,39 +1315,39 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_var_inv: @@ -1396,21 +1396,21 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind { ; SI-LABEL: v_ctpop_i16_add_vvar_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 @@ -1418,23 +1418,23 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_vvar_inv: @@ -1487,11 +1487,11 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) { ; SI-LABEL: ctpop_i16_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: s_cmp_lg_u32 s5, 0 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 ; SI-NEXT: s_cbranch_execnz .LBB14_3 ; SI-NEXT: .LBB14_2: ; %if -; SI-NEXT: s_and_b32 s2, s4, 0xffff +; SI-NEXT: s_and_b32 s2, s6, 0xffff ; SI-NEXT: s_bcnt1_i32_b32 s2, s2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -1517,11 +1517,11 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: s_cmp_lg_u32 s5, 0 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -1531,7 +1531,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 ; VI-NEXT: s_cbranch_execnz .LBB14_3 ; VI-NEXT: .LBB14_2: ; %if -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s6, 0xffff ; VI-NEXT: s_bcnt1_i32_b32 s2, s2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 1c16612bed37fc..3504546801c93b 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -16,24 +16,24 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -92,45 +92,45 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind { ; SI-LABEL: v_ctpop_i64_user: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v1, v0 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_or_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i64_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_or_b32_e32 v0, s0, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -144,30 +144,30 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; SI-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; VI-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_bcnt1_i32_b64 s4, s[8:9] +; SI-NEXT: s_bcnt1_i32_b64 s5, s[10:11] +; SI-NEXT: s_bcnt1_i32_b64 s6, s[12:13] +; SI-NEXT: s_bcnt1_i32_b64 s7, s[14:15] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; VI-NEXT: s_bcnt1_i32_b64 s4, s[8:9] +; VI-NEXT: s_bcnt1_i32_b64 s5, s[10:11] +; VI-NEXT: s_bcnt1_i32_b64 s6, s[12:13] +; VI-NEXT: s_bcnt1_i32_b64 s7, s[14:15] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64 define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -334,58 +334,58 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %if -; SI-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] +; SI-NEXT: s_mov_b32 s5, 0 ; SI-NEXT: .LBB7_3: ; %endif -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $sgpr0_sgpr1 +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s8, s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s8, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] +; VI-NEXT: s_mov_b32 s5, 0 ; VI-NEXT: .LBB7_3: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB7_4: -; VI-NEXT: ; implicit-def: $sgpr0_sgpr1 +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; VI-NEXT: s_branch .LBB7_2 entry: %tmp0 = icmp eq i32 %cond, 0 @@ -409,30 +409,30 @@ endif: define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind { ; SI-LABEL: s_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; SI-NEXT: s_add_i32 s4, s4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; SI-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; SI-NEXT: s_add_i32 s0, s0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; VI-NEXT: s_add_i32 s4, s4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; VI-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; VI-NEXT: s_add_i32 s0, s0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 @@ -443,36 +443,36 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind { ; SI-LABEL: s_ctpop_i65: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_bcnt1_i32_b32 s4, s4 -; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; SI-NEXT: s_add_i32 s4, s5, s4 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_and_b32 s0, s8, 0xff +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_bcnt1_i32_b32 s0, s0 +; SI-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; SI-NEXT: s_add_i32 s0, s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_i65: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_and_b32 s4, s8, 0xff -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_bcnt1_i32_b32 s4, s4 -; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] -; VI-NEXT: s_add_i32 s4, s5, s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_and_b32 s0, s8, 0xff +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_bcnt1_i32_b32 s0, s0 +; VI-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; VI-NEXT: s_add_i32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone %truncctpop = trunc i65 %ctpop to i32 @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index ccd23a91c35733..f0c278a67c8bcc 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -22,11 +22,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s2, s4 +; SI-NEXT: s_ff1_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -35,12 +35,12 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s4, s4 +; VI-NEXT: s_ff1_i32_b32 s4, s6 ; VI-NEXT: s_min_u32 s4, s4, 32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -61,11 +61,11 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_cttz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s2, s4 +; GFX10-NEXT: s_ff1_i32_b32 s2, s2 ; GFX10-NEXT: s_min_u32 s2, s2, 32 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -74,11 +74,11 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-GISEL-LABEL: s_cttz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s2 ; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -91,7 +91,7 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -204,7 +204,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -310,7 +310,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 @@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %valptr %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -505,12 +505,12 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b64 s4, s[4:5] +; SI-NEXT: s_ff1_i32_b64 s4, s[6:7] ; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -519,13 +519,13 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b64 s4, s[4:5] +; VI-NEXT: s_ff1_i32_b64 s4, s[6:7] ; VI-NEXT: s_min_u32 s4, s4, 64 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -552,21 +552,21 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_cttz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] @@ -574,7 +574,7 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) store i64 %cttz, ptr addrspace(1) %out @@ -584,7 +584,7 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -598,7 +598,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s0, s[6:7] -; GFX10-NEXT: s_min_u32 s0, s0, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] +; GFX10-NEXT: s_min_u32 s2, s2, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) %trunc = trunc i64 %cttz to i32 @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 @@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -787,7 +787,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -876,7 +876,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -895,7 +895,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -970,7 +970,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -989,7 +989,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1275,7 +1275,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, s0 -; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[4:5] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, s2 +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1375,7 +1375,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1443,31 +1443,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-NEXT: v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1500,7 +1500,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1543,23 +1543,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1571,7 +1571,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 4c7c8bc1c027d7..c4a742f4bf08df 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -16,11 +16,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -28,10 +28,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -51,11 +51,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -141,7 +141,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -219,7 +219,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -305,11 +305,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -317,10 +317,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -356,11 +356,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -374,11 +374,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,10 +386,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -425,11 +425,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -443,11 +443,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -455,10 +455,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -478,11 +478,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -496,7 +496,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i64 %val, 0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -577,7 +577,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -623,16 +623,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone @@ -645,7 +645,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -669,7 +669,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -723,18 +723,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone @@ -747,7 +747,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -778,7 +778,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -838,13 +838,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -855,7 +855,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone @@ -868,7 +868,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -915,7 +915,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1019,17 +1019,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1050,7 +1050,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -1063,7 +1063,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1093,7 +1093,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1154,13 +1154,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1276,13 +1276,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1294,7 +1294,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1340,7 +1340,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1406,13 +1406,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1424,7 +1424,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1455,7 +1455,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1501,18 +1501,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -1525,7 +1525,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1547,7 +1547,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1603,12 +1603,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 @@ -1616,7 +1616,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 850e701513fd7a..b897e1feed5d56 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -900,7 +900,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -933,29 +933,29 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_i8_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] @@ -974,7 +974,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -994,7 +994,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1011,33 +1011,33 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v2i8_to_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1060,7 +1060,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1100,35 +1100,35 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v3i8_to_v3f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1192,37 +1192,37 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1251,7 +1251,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1279,7 +1279,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1310,15 +1310,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -1327,19 +1327,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1348,12 +1348,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -1424,7 +1424,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s8, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1471,15 +1471,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3 -; GFX10-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2 +; GFX10-NEXT: global_load_ubyte v1, v0, s[12:13] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[12:13] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3 +; GFX10-NEXT: global_load_ubyte v4, v0, s[14:15] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 @@ -1489,21 +1489,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405 -; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dword v7, v4, s[6:7] +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9] +; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_mov_b32 s0, 0x4000405 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2 -; GFX9-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3 -; GFX9-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3 -; GFX9-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2 +; GFX9-NEXT: global_load_ubyte v1, v0, s[12:13] offset:2 +; GFX9-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3 +; GFX9-NEXT: global_load_ubyte v3, v0, s[12:13] offset:3 +; GFX9-NEXT: global_load_ubyte v4, v0, s[14:15] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 @@ -1513,13 +1513,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_perm_b32 v4, v6, v7, s0 -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] -; GFX9-NEXT: global_store_dword v5, v4, s[6:7] +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1561,21 +1561,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1584,7 +1584,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -1597,12 +1597,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1641,12 +1641,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1664,21 +1664,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dword v4, v5, s[6:7] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v4, v5, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] -; GFX9-NEXT: s_movk_i32 s0, 0xff00 -; GFX9-NEXT: s_movk_i32 s1, 0x900 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_movk_i32 s4, 0xff00 +; GFX9-NEXT: s_movk_i32 s5, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1686,27 +1687,28 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 -; GFX9-NEXT: v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v5, v0, s[6:7] +; GFX9-NEXT: global_store_dword v5, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1751,7 +1753,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1790,7 +1792,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1835,17 +1837,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] offset:1 -; GFX10-NEXT: global_load_short_d16 v7, v0, s[6:7] offset:4 -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(4) @@ -1859,22 +1861,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[4:5] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v7, v0, s[6:7] offset:2 -; GFX9-NEXT: global_load_ubyte v8, v0, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v9, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -1888,13 +1890,13 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 -; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] -; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[4:5] offset:16 +; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1935,7 +1937,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1962,7 +1964,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1986,11 +1988,11 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 @@ -2000,17 +2002,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[4:5] offset:16 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -2020,13 +2022,13 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[4:5] offset:16 -; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -2057,7 +2059,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2077,7 +2079,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2094,33 +2096,33 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2144,7 +2146,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2163,7 +2165,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2179,31 +2181,31 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2228,7 +2230,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -2246,7 +2248,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2261,29 +2263,29 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] @@ -2303,7 +2305,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2331,7 +2333,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2362,15 +2364,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -2379,19 +2381,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -2400,12 +2402,12 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2437,7 +2439,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2456,7 +2458,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2472,31 +2474,31 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte0_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2518,7 +2520,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2537,7 +2539,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2553,31 +2555,31 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2600,7 +2602,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2619,7 +2621,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2635,31 +2637,31 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte2_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2682,7 +2684,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2701,7 +2703,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2717,31 +2719,31 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte3_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2764,7 +2766,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -2784,7 +2786,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -2804,35 +2806,35 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[4:5] +; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[6:7] +; GFX10-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: cvt_ubyte0_or_multiuse: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[4:5] +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 017a1f047bb5f3..9a98a7cd01ed4b 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -8,13 +8,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_add v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -30,13 +30,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -52,13 +52,13 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_and v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -74,13 +74,13 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_or v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -96,13 +96,13 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -118,28 +118,28 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: nand: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_not_b32_e32 v0, v3 ; CHECK-NEXT: v_or_b32_e32 v2, -2, v0 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc +; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -154,13 +154,13 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -176,13 +176,13 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -198,13 +198,13 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -220,13 +220,13 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -242,13 +242,13 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -264,13 +264,13 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -286,13 +286,13 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -308,13 +308,13 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -330,14 +330,14 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: cmpxchg: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -354,13 +354,13 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace( define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xchg: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -376,13 +376,13 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -398,13 +398,13 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 @@ -420,28 +420,28 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc +; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB18_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -457,28 +457,28 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fsub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc +; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -494,14 +494,14 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] @@ -520,14 +520,14 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[4:5] glc -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] @@ -545,14 +545,14 @@ define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.swap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -567,14 +567,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_add v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -589,14 +589,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_sub v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -611,14 +611,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_smin v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -633,14 +633,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_smax v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -655,14 +655,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_umin v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -677,14 +677,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_umax v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -699,14 +699,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_and v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -721,14 +721,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_or v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -743,14 +743,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_xor v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -765,14 +765,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_inc v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -787,14 +787,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: buffer_atomic_dec v0, v1, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -809,14 +809,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.cmpswap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] @@ -832,14 +832,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v1, 1.0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -857,14 +857,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] @@ -883,14 +883,14 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll index 89f9583234291b..4b4718a2acb809 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll @@ -42,8 +42,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: %9:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY %9 + ; CHECK-NEXT: %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY %10 ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %result = or disjoint i32 %a, %b ret i32 %result @@ -58,10 +58,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: %11:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec - ; CHECK-NEXT: %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY %11 - ; CHECK-NEXT: $vgpr1 = COPY %12 + ; CHECK-NEXT: %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec + ; CHECK-NEXT: %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY %12 + ; CHECK-NEXT: $vgpr1 = COPY %13 ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %result = or disjoint <2 x i32> %a, %b ret <2 x i32> %result diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index 297fe7618672e6..fcb8fa5997b7e8 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp0_b32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb10 -; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[8:9] +; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8 ; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8 @@ -33,21 +33,21 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x48 -; CHECK-NEXT: v_mov_b32_e32 v8, s10 -; CHECK-NEXT: v_mov_b32_e32 v9, s11 -; CHECK-NEXT: v_mov_b32_e32 v10, s12 -; CHECK-NEXT: v_mov_b32_e32 v11, s13 -; CHECK-NEXT: v_mov_b32_e32 v12, s14 -; CHECK-NEXT: v_mov_b32_e32 v13, s15 -; CHECK-NEXT: v_mov_b32_e32 v14, s16 -; CHECK-NEXT: v_mov_b32_e32 v15, s17 -; CHECK-NEXT: v_mov_b32_e32 v16, s18 -; CHECK-NEXT: v_mov_b32_e32 v17, s19 -; CHECK-NEXT: v_mov_b32_e32 v18, s20 -; CHECK-NEXT: v_mov_b32_e32 v19, s21 -; CHECK-NEXT: v_mov_b32_e32 v20, s22 -; CHECK-NEXT: v_mov_b32_e32 v21, s23 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x48 +; CHECK-NEXT: v_mov_b32_e32 v8, s14 +; CHECK-NEXT: v_mov_b32_e32 v9, s15 +; CHECK-NEXT: v_mov_b32_e32 v10, s16 +; CHECK-NEXT: v_mov_b32_e32 v11, s17 +; CHECK-NEXT: v_mov_b32_e32 v12, s18 +; CHECK-NEXT: v_mov_b32_e32 v13, s19 +; CHECK-NEXT: v_mov_b32_e32 v14, s20 +; CHECK-NEXT: v_mov_b32_e32 v15, s21 +; CHECK-NEXT: v_mov_b32_e32 v16, s22 +; CHECK-NEXT: v_mov_b32_e32 v17, s23 +; CHECK-NEXT: v_mov_b32_e32 v18, s24 +; CHECK-NEXT: v_mov_b32_e32 v19, s25 +; CHECK-NEXT: v_mov_b32_e32 v20, s26 +; CHECK-NEXT: v_mov_b32_e32 v21, s27 ; CHECK-NEXT: flat_store_byte v[8:9], v7 ; CHECK-NEXT: flat_store_byte v[10:11], v6 ; CHECK-NEXT: flat_store_byte v[12:13], v5 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll index 5cadb65c9c942f..012f33952f990e 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll @@ -42,9 +42,9 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 % ; GCN-LABEL: name: uniform_opt_lshr_and_cmp ; GCN: bb.0.entry: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll index 8fa0068a237cd5..6b1ecc9dffdb70 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @eq_t(float %x) { ; GCN-LABEL: eq_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -21,7 +21,7 @@ define amdgpu_kernel void @eq_t(float %x) { define amdgpu_kernel void @ne_t(float %x) { ; GCN-LABEL: ne_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -38,7 +38,7 @@ define amdgpu_kernel void @ne_t(float %x) { define amdgpu_kernel void @eq_f(float %x) { ; GCN-LABEL: eq_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel void @eq_f(float %x) { define amdgpu_kernel void @ne_f(float %x) { ; GCN-LABEL: ne_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index f139943ff2bcbf..142ec2f926e800 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -7,11 +7,11 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_0_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_lshl_b32 s4, s2, 16 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -19,22 +19,22 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_0_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s4, 16 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_0_i16: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_lshl_b32 s2, s4, 16 +; GFX906-NEXT: s_lshl_b32 s2, s2, 16 ; GFX906-NEXT: v_mov_b32_e32 v1, s2 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm @@ -42,10 +42,10 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GFX11-LABEL: uniform_vec_0_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -90,11 +90,11 @@ define i32 @divergent_vec_0_i16(i16 %a) { define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_i16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -102,22 +102,22 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX906-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX906-NEXT: v_mov_b32_e32 v1, s2 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm @@ -125,10 +125,10 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GFX11-LABEL: uniform_vec_i16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -173,11 +173,11 @@ define i32 @divergent_vec_i16_0(i16 %a) { define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GCN-LABEL: uniform_vec_f16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -185,22 +185,22 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; ; GFX9-LABEL: uniform_vec_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_f16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX906-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX906-NEXT: v_mov_b32_e32 v1, s2 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm @@ -208,10 +208,10 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GFX11-LABEL: uniform_vec_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -256,7 +256,7 @@ define float @divergent_vec_f16_0(half %a) { define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_i16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -271,12 +271,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_i16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND @@ -284,12 +284,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_i16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s0 ; GFX906-NEXT: ;;#ASMEND @@ -297,7 +297,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_i16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -355,7 +355,7 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) { define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 @@ -370,27 +370,27 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[4:5] +; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_LH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -444,7 +444,7 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_HH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -458,27 +458,27 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7 -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: global_store_dword v0, v1, s[4:5] +; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 +; GFX906-NEXT: v_mov_b32_e32 v1, s2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_HH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -536,7 +536,7 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_f16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -551,12 +551,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_f16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND @@ -564,12 +564,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_f16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5 ; GFX906-NEXT: ;;#ASMSTART ; GFX906-NEXT: ; use s0 ; GFX906-NEXT: ;;#ASMEND @@ -577,7 +577,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_f16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -674,10 +674,10 @@ entry: define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 { ; GCN-LABEL: build_vec_v2i16_undeflo_uniform: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u16 v0, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -688,11 +688,11 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; ; GFX9-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_read_u16_d16 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] @@ -700,11 +700,11 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; ; GFX906-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX906-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v0, s4 +; GFX906-NEXT: v_mov_b32_e32 v0, s2 ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -713,10 +713,10 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; GFX11-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll index d99e9699c27894..4c3fd40d7a25ae 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -25,7 +25,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,7 +48,7 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -69,7 +69,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll index 5b39cc2e185b7d..3303cb86c874e6 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i16_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -58,9 +58,9 @@ define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i32_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s64) from %ir.x.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -110,9 +110,9 @@ define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i64_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll index 75d9dd924a4d60..b1664c59a7e4c8 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds1align1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u8 v0, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds2align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds2align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds2align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_u16 v0, v0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds2align2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u16 v0, v0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 @@ -160,7 +160,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -190,7 +190,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds4align4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b32 v0, v0 @@ -206,7 +206,7 @@ define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -291,7 +291,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 @@ -311,7 +311,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -331,7 +331,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -347,7 +347,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 @@ -379,7 +379,7 @@ define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -473,7 +473,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -489,7 +489,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 @@ -513,7 +513,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -539,7 +539,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -555,7 +555,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds12align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -569,7 +569,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -583,7 +583,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds12align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -599,7 +599,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align8: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 @@ -613,7 +613,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align8: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -627,7 +627,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align8: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 @@ -641,7 +641,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds12align8: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -657,7 +657,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds12align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b96 v[0:2], v0 @@ -673,7 +673,7 @@ define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %o define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -716,7 +716,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 @@ -805,7 +805,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 @@ -835,7 +835,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -867,7 +867,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 @@ -883,7 +883,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds16align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -897,7 +897,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds16align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 @@ -911,7 +911,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds16align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -927,7 +927,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -943,7 +943,7 @@ define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b128 v[0:3], v0 diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll index 31bbe6fbbaa143..9712c62166cfee 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: ds_read32_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -47,7 +47,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_20: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -91,7 +91,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -136,7 +136,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96 @@ -172,7 +172,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -206,7 +206,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -246,7 +246,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -280,7 +280,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -316,7 +316,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -352,7 +352,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 @@ -379,7 +379,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], vcc, 4, [[BASE]] @@ -406,7 +406,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -437,7 +437,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[8:9], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE]], vcc, 8, [[BASE]] diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 352d55073d6722..8100dc522fd97b 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -54,7 +54,7 @@ entry: define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -74,7 +74,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0 @@ -91,7 +91,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b @@ -107,7 +107,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -135,7 +135,7 @@ entry: define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: s_mov_b64 vcc, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -154,7 +154,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v4, 0 @@ -170,7 +170,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b @@ -185,7 +185,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0 @@ -527,7 +527,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 { ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -548,7 +548,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0 @@ -566,7 +566,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 @@ -584,7 +584,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index b72cd7e1d1eca4..4d9c85ef99dcd1 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -51,7 +51,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -64,7 +64,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:1028 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,7 +102,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -126,7 +126,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -142,7 +142,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -184,7 +184,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: s_mov_b32 s2, 0 @@ -202,7 +202,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -245,7 +245,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -301,7 +301,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -319,7 +319,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -352,7 +352,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -406,7 +406,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -419,7 +419,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -449,7 +449,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -487,7 +487,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +501,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -522,9 +522,11 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:34 @@ -535,7 +537,7 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: ds_read_u8 v7, v1 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 @@ -550,19 +552,17 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2 @@ -587,12 +587,12 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -612,9 +612,11 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:11 @@ -625,7 +627,7 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_read_u8 v7, v1 offset:5 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 @@ -640,19 +642,17 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s2, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5 ; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6 ; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7 @@ -677,12 +677,12 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -702,31 +702,31 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_2_simple_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 ; CI-NEXT: ds_read_u16 v2, v1 offset:32 ; CI-NEXT: ds_read_u16 v3, v1 offset:2 ; CI-NEXT: ds_read_u16 v4, v1 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_add_f32_e32 v2, v3, v1 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 @@ -734,7 +734,7 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 @@ -744,12 +744,12 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -772,7 +772,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -785,7 +785,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -808,7 +808,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -821,7 +821,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -845,7 +845,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[1:2], v0 ; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,7 +859,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -880,16 +880,16 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_read2_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v0 ; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 ; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] ; CI-NEXT: v_mov_b32_e32 v1, 0 @@ -898,13 +898,13 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: misaligned_read2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -929,7 +929,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -941,7 +941,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -959,7 +959,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -971,7 +971,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -991,7 +991,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b128 v[0:3], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b128 v[0:3], v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1026,7 +1026,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 ; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1040,7 +1040,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1059,7 +1059,8 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { ; CI-LABEL: sgemm_inner_loop_read2_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_lshl_b32 s4, s6, 2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_lshl_b32 s4, s8, 2 ; CI-NEXT: s_add_i32 s5, s4, 0xc20 ; CI-NEXT: s_addk_i32 s4, 0xc60 ; CI-NEXT: v_mov_b32_e32 v0, s5 @@ -1071,16 +1072,12 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; CI-NEXT: s_waitcnt lgkmcnt(4) +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v1 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 -; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v5 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v6 ; CI-NEXT: v_add_f32_e32 v0, v0, v7 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1092,8 +1089,7 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; ; GFX9-LABEL: sgemm_inner_loop_read2_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_lshl_b32 s2, s8, 2 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 ; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1104,12 +1100,16 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1163,25 +1163,25 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1194,25 +1194,25 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1225,22 +1225,22 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-LABEL: ds_read_diff_base_interleaving: ; CI: ; %bb.0: ; %bb -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v1 -; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v0 -; CI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 -; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, s1, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v1 +; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0 ; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 ; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 @@ -1251,29 +1251,28 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-NEXT: v_sub_f32_e32 v0, v0, v1 ; CI-NEXT: v_mul_f32_e32 v1, v5, v7 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:40 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 -; GFX9-NEXT: v_add_u32_e32 v3, s5, v0 -; GFX9-NEXT: v_add_u32_e32 v4, s6, v1 -; GFX9-NEXT: v_add_u32_e32 v6, s7, v0 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v1 +; GFX9-NEXT: v_add_u32_e32 v3, s1, v0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v1 +; GFX9-NEXT: v_add_u32_e32 v6, s3, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 ; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 -; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 @@ -1325,29 +1324,31 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_getpc_b64 s[40:41] ; CI-NEXT: s_mov_b32 s40, s0 ; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 -; CI-NEXT: s_mov_b64 s[10:11], s[4:5] -; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x0 -; CI-NEXT: s_load_dword s4, s[2:3], 0x2 +; CI-NEXT: s_mov_b32 s14, s10 ; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_mov_b32 s12, s8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s40, s40, s9 +; CI-NEXT: s_add_u32 s40, s40, s11 +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 +; CI-NEXT: s_load_dword s6, s[4:5], 0x2 ; CI-NEXT: s_addc_u32 s41, s41, 0 -; CI-NEXT: v_add_i32_e32 v40, vcc, s4, v3 -; CI-NEXT: ds_read_b32 v41, v40 -; CI-NEXT: s_mov_b32 s14, s8 -; CI-NEXT: s_add_u32 s8, s2, 12 +; CI-NEXT: s_add_u32 s8, s4, 12 ; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CI-NEXT: s_addc_u32 s9, s3, 0 +; CI-NEXT: s_mov_b32 s13, s9 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3 +; CI-NEXT: ds_read_b32 v41, v40 +; CI-NEXT: s_addc_u32 s9, s5, 0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_mov_b64 s[0:1], s[40:41] ; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; CI-NEXT: v_or_b32_e32 v31, v0, v2 -; CI-NEXT: s_mov_b32 s12, s6 -; CI-NEXT: s_mov_b32 s13, s7 ; CI-NEXT: s_mov_b64 s[2:3], s[42:43] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 @@ -1364,26 +1365,28 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: s_getpc_b64 s[36:37] ; GFX9-NEXT: s_mov_b32 s36, s0 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s36, s36, s9 -; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s4 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_add_u32 s8, s4, 12 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6 ; GFX9-NEXT: ds_read_b32 v42, v41 -; GFX9-NEXT: s_add_u32 s8, s2, 12 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; GFX9-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: v_mov_b32_e32 v40, 0 @@ -1461,7 +1464,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: ds_read_u8 v6, v0 offset:66 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 @@ -1488,7 +1491,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 @@ -1505,7 +1508,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 9649cdc6001cdd..1f805b6d07f711 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -44,7 +44,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -60,7 +60,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -85,7 +85,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 @@ -131,7 +131,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: ds_write_b32 v0, v2 offset:32 @@ -182,7 +182,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -199,7 +199,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace ; ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ; kill: killed $vgpr4 @@ -229,7 +229,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -244,7 +244,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -268,7 +268,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg4_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 @@ -283,7 +283,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg4_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,7 +307,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_max_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -323,7 +323,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_max_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -348,7 +348,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_too_far_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_too_far_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_write_b32 v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -394,7 +394,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; ; GFX9-LABEL: simple_write2_two_val_f32_x2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 @@ -450,7 +450,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27 @@ -506,21 +506,21 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: write2_ptr_subreg_arg_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x6 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x6 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[4:5] +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: s_mov_b64 s[2:3], s[10:11] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_mov_b64 s[0:1], s[2:3] +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v1, s12 +; CI-NEXT: v_mov_b32_e32 v1, s8 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: v_mov_b32_e32 v3, s13 +; CI-NEXT: v_mov_b32_e32 v3, s9 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: ds_write_b32 v1, v2 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -529,14 +529,14 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -566,7 +566,7 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -580,7 +580,7 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -601,8 +601,8 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -618,12 +618,12 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) ; ; GFX9-LABEL: misaligned_simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15 @@ -642,8 +642,8 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -675,12 +675,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 +; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s2, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 @@ -702,12 +702,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s2, v2 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5 ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9 @@ -726,7 +726,7 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc @@ -868,11 +868,11 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 { ; CI-LABEL: write2_sgemm_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s0, s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s1, s6, 2 +; CI-NEXT: s_lshl_b32 s1, s8, 2 ; CI-NEXT: s_add_i32 s2, s1, 0xc20 ; CI-NEXT: s_addk_i32 s1, 0xc60 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -890,8 +890,8 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, ; ; GFX9-LABEL: write2_sgemm_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_lshl_b32 s2, s8, 2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_add_i32 s1, s2, 0xc20 @@ -945,8 +945,8 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_v4f32_superreg_align4: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6 -; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s8 +; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 @@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6 -; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s8 +; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index 16f16f56248cbf..2cd3916165fe7c 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -50,19 +50,19 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v40, v31 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 ; CHECK-NEXT: s_mov_b32 s45, s12 -; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39] -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s12, s45 ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 85ed2914b8c7f5..9104dc68eb9b49 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s14, s[6:7], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s13 +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 +; CHECK-NEXT: s_add_u32 s24, s24, s15 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s14, 8 +; CHECK-NEXT: s_bitcmp1_b32 s6, 8 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] ; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index d6bde79802847c..598cdddaa53d10 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -93,7 +93,7 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s2, s2 ; GFX9-NEXT: s_cmp_lt_u32 s0, s2 @@ -146,7 +146,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX10-LABEL: s_add_co_br_user: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s0, s0 ; GFX10-NEXT: s_cmp_lt_u32 s1, s0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX11-LABEL: s_add_co_br_user: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index 54ec7578700df8..67f2487aed73af 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -4,15 +4,15 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s6, 1 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[2:3] ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 ; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc @@ -29,15 +29,15 @@ entry: define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: int4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s6, 1 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 ; GCN-NEXT: v_cndmask_b32_e32 v0, 2, v0, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, 4, v0, vcc @@ -54,20 +54,20 @@ entry: define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: double4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s2, 0x3ff028f5 -; GCN-NEXT: s_mov_b32 s3, 0xc28f5c29 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s3, 0x3ff028f5 +; GCN-NEXT: s_mov_b32 s4, 0xc28f5c29 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b32 s2, s2, 0x3f847ae1 -; GCN-NEXT: s_cselect_b32 s3, s3, 0x47ae147b -; GCN-NEXT: s_cmp_eq_u32 s4, 2 -; GCN-NEXT: s_cselect_b32 s3, 0xe147ae14, s3 -; GCN-NEXT: s_cselect_b32 s2, 0x4000147a, s2 -; GCN-NEXT: s_cmp_eq_u32 s4, 3 -; GCN-NEXT: s_cselect_b32 s2, 0x40100a3d, s2 -; GCN-NEXT: s_cselect_b32 s3, 0x70a3d70a, s3 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s3, s3, 0x3f847ae1 +; GCN-NEXT: s_cselect_b32 s4, s4, 0x47ae147b +; GCN-NEXT: s_cmp_eq_u32 s2, 2 +; GCN-NEXT: s_cselect_b32 s4, 0xe147ae14, s4 +; GCN-NEXT: s_cselect_b32 s3, 0x4000147a, s3 +; GCN-NEXT: s_cmp_eq_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s2, 0x40100a3d, s3 +; GCN-NEXT: s_cselect_b32 s3, 0x70a3d70a, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -83,8 +83,8 @@ entry: define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: double5_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, 0x3ff028f5 ; GCN-NEXT: s_mov_b32 s3, 0xc28f5c29 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -120,12 +120,12 @@ entry: define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: half4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, 0x40003c00 ; GCN-NEXT: s_mov_b32 s3, 0x44004200 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 4 +; GCN-NEXT: s_lshl_b32 s4, s6, 4 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -141,10 +141,10 @@ entry: define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float2_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3] @@ -160,14 +160,14 @@ entry: define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: double2_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s2, 0x3ff028f5 -; GCN-NEXT: s_mov_b32 s3, 0xc28f5c29 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s3, 0x3ff028f5 +; GCN-NEXT: s_mov_b32 s4, 0xc28f5c29 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b32 s2, s2, 0x3f847ae1 -; GCN-NEXT: s_cselect_b32 s3, s3, 0x47ae147b +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s2, s3, 0x3f847ae1 +; GCN-NEXT: s_cselect_b32 s3, s4, 0x47ae147b ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -183,34 +183,34 @@ entry: define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: half8_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: v_mov_b32_e32 v1, 0x4000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x4200 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x4400 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x4500 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x4600 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x4700 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x4800 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -228,23 +228,23 @@ entry: define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: short8_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b32 s2, 2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 2 -; GCN-NEXT: s_cselect_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s4, 3 -; GCN-NEXT: s_cselect_b32 s2, s2, 4 -; GCN-NEXT: s_cmp_lg_u32 s4, 4 -; GCN-NEXT: s_cselect_b32 s2, s2, 5 -; GCN-NEXT: s_cmp_lg_u32 s4, 5 -; GCN-NEXT: s_cselect_b32 s2, s2, 6 -; GCN-NEXT: s_cmp_lg_u32 s4, 6 -; GCN-NEXT: s_cselect_b32 s2, s2, 7 -; GCN-NEXT: s_cmp_lg_u32 s4, 7 -; GCN-NEXT: s_cselect_b32 s2, s2, 8 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s3, 2, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cselect_b32 s3, s3, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s3, s3, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cselect_b32 s3, s3, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cselect_b32 s3, s3, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cselect_b32 s3, s3, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cselect_b32 s2, s3, 8 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -259,8 +259,8 @@ entry: define amdgpu_kernel void @float8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float8_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000 @@ -285,8 +285,8 @@ entry: define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: double8_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s18, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s18, s[4:5], 0x2c ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s15, 0x40200000 ; GCN-NEXT: s_mov_b32 s13, 0x401c0000 @@ -337,8 +337,8 @@ entry: define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: double7_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s16, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s16, s[4:5], 0x2c ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s13, 0x401c0000 ; GCN-NEXT: s_mov_b32 s11, 0x40180000 @@ -387,8 +387,8 @@ entry: define amdgpu_kernel void @float16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float16_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000 @@ -421,8 +421,8 @@ entry: define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: double15_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 ; GCN-NEXT: s_mov_b32 s63, 0x402c0000 @@ -454,7 +454,7 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s62, s36 ; GCN-NEXT: s_mov_b32 s64, s36 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v31, s67 @@ -503,8 +503,8 @@ entry: define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: double16_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s67, 0x40300000 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 @@ -538,7 +538,7 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s64, s36 ; GCN-NEXT: s_mov_b32 s66, s36 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v31, s67 @@ -587,13 +587,13 @@ entry: define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float32_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_mov_b32_e32 v3, 4.0 ; GCN-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; GCN-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -637,12 +637,12 @@ entry: define amdgpu_kernel void @byte8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: byte8_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, 0x4030201 ; GCN-NEXT: s_mov_b32 s3, 0x8070605 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: s_lshl_b32 s4, s6, 3 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -658,39 +658,39 @@ entry: define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: byte16_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 -; GCN-NEXT: s_cselect_b32 s2, 2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 2 -; GCN-NEXT: s_cselect_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s4, 3 -; GCN-NEXT: s_cselect_b32 s2, s2, 4 -; GCN-NEXT: s_cmp_lg_u32 s4, 4 -; GCN-NEXT: s_cselect_b32 s2, s2, 5 -; GCN-NEXT: s_cmp_lg_u32 s4, 5 -; GCN-NEXT: s_cselect_b32 s2, s2, 6 -; GCN-NEXT: s_cmp_lg_u32 s4, 6 -; GCN-NEXT: s_cselect_b32 s2, s2, 7 -; GCN-NEXT: s_cmp_lg_u32 s4, 7 -; GCN-NEXT: s_cselect_b32 s2, s2, 8 -; GCN-NEXT: s_cmp_lg_u32 s4, 8 -; GCN-NEXT: s_cselect_b32 s2, s2, 9 -; GCN-NEXT: s_cmp_lg_u32 s4, 9 -; GCN-NEXT: s_cselect_b32 s2, s2, 10 -; GCN-NEXT: s_cmp_lg_u32 s4, 10 -; GCN-NEXT: s_cselect_b32 s2, s2, 11 -; GCN-NEXT: s_cmp_lg_u32 s4, 11 -; GCN-NEXT: s_cselect_b32 s2, s2, 12 -; GCN-NEXT: s_cmp_lg_u32 s4, 12 -; GCN-NEXT: s_cselect_b32 s2, s2, 13 -; GCN-NEXT: s_cmp_lg_u32 s4, 13 -; GCN-NEXT: s_cselect_b32 s2, s2, 14 -; GCN-NEXT: s_cmp_lg_u32 s4, 14 -; GCN-NEXT: s_cselect_b32 s2, s2, 15 -; GCN-NEXT: s_cmp_lg_u32 s4, 15 -; GCN-NEXT: s_cselect_b32 s2, s2, 16 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s3, 2, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cselect_b32 s3, s3, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s3, s3, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cselect_b32 s3, s3, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cselect_b32 s3, s3, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cselect_b32 s3, s3, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cselect_b32 s3, s3, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, 8 +; GCN-NEXT: s_cselect_b32 s3, s3, 9 +; GCN-NEXT: s_cmp_lg_u32 s2, 9 +; GCN-NEXT: s_cselect_b32 s3, s3, 10 +; GCN-NEXT: s_cmp_lg_u32 s2, 10 +; GCN-NEXT: s_cselect_b32 s3, s3, 11 +; GCN-NEXT: s_cmp_lg_u32 s2, 11 +; GCN-NEXT: s_cselect_b32 s3, s3, 12 +; GCN-NEXT: s_cmp_lg_u32 s2, 12 +; GCN-NEXT: s_cselect_b32 s3, s3, 13 +; GCN-NEXT: s_cmp_lg_u32 s2, 13 +; GCN-NEXT: s_cselect_b32 s3, s3, 14 +; GCN-NEXT: s_cmp_lg_u32 s2, 14 +; GCN-NEXT: s_cselect_b32 s3, s3, 15 +; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: s_cselect_b32 s2, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -705,10 +705,10 @@ entry: define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: bit4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, 3 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 ; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -726,265 +726,265 @@ entry: define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: bit128_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 1 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GCN-NEXT: s_cmp_lg_u32 s4, 2 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 3 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 4 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 5 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 6 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 7 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 8 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 9 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 10 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 11 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 12 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 13 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 14 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 15 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 16 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 17 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 18 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 19 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 20 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 21 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 22 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 23 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 24 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 25 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 26 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 27 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 28 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 29 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 30 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 31 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 32 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 33 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 34 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 35 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 36 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 37 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 38 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 39 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 40 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 41 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 42 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 43 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 44 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 45 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 46 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 47 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 48 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 49 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 50 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 51 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 52 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 53 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 54 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 55 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 56 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 57 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 58 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 59 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 60 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 61 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 62 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s4, 63 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 64 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x41 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x42 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x43 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x44 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x45 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x46 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x47 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x48 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x49 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4a -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4b -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4c -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4d -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4e -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x4f -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x50 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x51 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x52 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x53 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x54 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x55 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x56 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x57 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x58 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x59 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5a -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5b -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5c -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5d -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5e -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x5f -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x60 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x61 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x62 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x63 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x64 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x65 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x66 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x67 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x68 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x69 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6a -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6b -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6c -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6d -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6e -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x6f -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x70 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x71 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x72 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x73 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x74 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x75 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x76 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x77 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x78 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x79 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7a -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7b -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7c -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7d -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7e -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s4, 0x7f -; GCN-NEXT: s_cselect_b32 s2, s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 8 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 9 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 10 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 11 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 12 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 13 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 14 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 16 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 17 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 18 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 19 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 20 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 21 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 22 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 23 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 24 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 25 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 26 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 27 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 28 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 29 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 30 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 31 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 32 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 33 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 34 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 35 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 36 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 37 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 38 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 39 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 40 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 41 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 42 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 43 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 44 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 45 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 46 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 47 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 48 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 49 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 50 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 51 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 52 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 53 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 54 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 55 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 57 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 58 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 59 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 60 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 61 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 62 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 63 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 64 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x41 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x42 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x43 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x44 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x45 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x46 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x47 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x48 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x49 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4a +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4b +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4c +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4d +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4e +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4f +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x50 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x51 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x52 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x53 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x54 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x55 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x56 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x57 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x58 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x59 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5a +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5b +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5c +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5d +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5e +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5f +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x60 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x61 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x62 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x63 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x64 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x65 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x66 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x67 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x68 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x69 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6a +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6b +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6c +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6d +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6e +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6f +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x70 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x71 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x72 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x73 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x74 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x75 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x76 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x77 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x78 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x79 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7a +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7b +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7c +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7d +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7e +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7f +; GCN-NEXT: s_cselect_b32 s2, s3, 0 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 858eaace8dcbdf..c3c1540383ec63 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: extract_vector_elt_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -36,7 +36,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: extract_vector_elt_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -60,51 +60,50 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s0, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s1, s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_lshl_b32 s0, s0, 4 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_lshl_b32 s4, s4, 4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s0, s1, s0 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_lshr_b32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s6, s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_lshl_b32 s4, s8, 4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s8, 4 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s6, s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_lshr_b32 s0, s2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_lshl_b32 s3, s4, 4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_lshr_b32 s2, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx @@ -115,27 +114,27 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 -; SI-NEXT: s_load_dword s6, s[6:7], 0x0 -; SI-NEXT: s_mov_b64 s[0:1], s[4:5] +; SI-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_lshr_b32_e32 v0, s6, v0 -; SI-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: v_lshr_b32_e32 v0, s2, v0 +; SI-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -143,7 +142,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -158,14 +157,14 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -189,7 +188,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -204,7 +203,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; VI-LABEL: extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +217,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; GFX11-LABEL: extract_vector_elt_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -241,11 +240,11 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 4 +; SI-NEXT: s_lshl_b32 s4, s6, 4 ; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 @@ -256,26 +255,26 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: dynamic_extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_lshl_b32 s4, s8, 4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s8, 4 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: s_lshl_b32 s4, s6, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -292,7 +291,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_extractelement_v4f16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -309,7 +308,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_extractelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -328,7 +327,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_extractelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -351,7 +350,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -372,7 +371,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -395,7 +394,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 @@ -425,7 +424,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_01: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -442,7 +441,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_01: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +458,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +483,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x1 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -501,7 +500,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +517,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -543,18 +542,18 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] -; SI-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_cmp_eq_u32 s8, 1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 @@ -590,45 +589,45 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[6:7], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_short v0, v[6:7], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; VI-NEXT: v_mov_b32_e32 v6, s5 -; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0 -; VI-NEXT: s_cmp_eq_u32 s0, 1 +; VI-NEXT: v_mov_b32_e32 v6, s1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 2 +; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 3 +; VI-NEXT: s_cmp_eq_u32 s4, 3 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 4 +; VI-NEXT: s_cmp_eq_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 5 +; VI-NEXT: s_cmp_eq_u32 s4, 5 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 6 +; VI-NEXT: s_cmp_eq_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 7 +; VI-NEXT: s_cmp_eq_u32 s4, 7 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -638,46 +637,45 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7] -; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_cmp_eq_u32 s4, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 2 +; GFX11-NEXT: s_cmp_eq_u32 s4, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_cmp_eq_u32 s0, 3 +; GFX11-NEXT: s_cmp_eq_u32 s4, 3 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 4 +; GFX11-NEXT: s_cmp_eq_u32 s4, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 5 +; GFX11-NEXT: s_cmp_eq_u32 s4, 5 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 6 +; GFX11-NEXT: s_cmp_eq_u32 s4, 6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_cmp_eq_u32 s0, 7 +; GFX11-NEXT: s_cmp_eq_u32 s4, 7 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -692,19 +690,19 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] -; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 ; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 ; SI-NEXT: v_mov_b32_e32 v10, v6 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] -; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[0:3], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_cmp_eq_u32 s8, 1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 @@ -777,77 +775,77 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[9:10], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx4 v[1:4], v[5:6] ; VI-NEXT: v_add_u32_e32 v5, vcc, 16, v5 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; VI-NEXT: flat_load_dwordx4 v[5:8], v[5:6] -; VI-NEXT: v_mov_b32_e32 v10, s5 -; VI-NEXT: v_add_u32_e32 v9, vcc, s4, v0 -; VI-NEXT: s_cmp_eq_u32 s0, 1 +; VI-NEXT: v_mov_b32_e32 v10, s1 +; VI-NEXT: v_add_u32_e32 v9, vcc, s0, v0 +; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 2 +; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 3 +; VI-NEXT: s_cmp_eq_u32 s4, 3 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 4 +; VI-NEXT: s_cmp_eq_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 5 +; VI-NEXT: s_cmp_eq_u32 s4, 5 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 6 +; VI-NEXT: s_cmp_eq_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 7 +; VI-NEXT: s_cmp_eq_u32 s4, 7 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 8 +; VI-NEXT: s_cmp_eq_u32 s4, 8 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 9 +; VI-NEXT: s_cmp_eq_u32 s4, 9 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 10 +; VI-NEXT: s_cmp_eq_u32 s4, 10 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 11 +; VI-NEXT: s_cmp_eq_u32 s4, 11 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 12 +; VI-NEXT: s_cmp_eq_u32 s4, 12 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 13 +; VI-NEXT: s_cmp_eq_u32 s4, 13 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 14 +; VI-NEXT: s_cmp_eq_u32 s4, 14 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s0, 15 +; VI-NEXT: s_cmp_eq_u32 s4, 15 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -857,81 +855,80 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[6:7] offset:16 -; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16 +; GFX11-NEXT: s_cmp_eq_u32 s4, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 2 +; GFX11-NEXT: s_cmp_eq_u32 s4, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX11-NEXT: s_cmp_eq_u32 s0, 3 +; GFX11-NEXT: s_cmp_eq_u32 s4, 3 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 4 +; GFX11-NEXT: s_cmp_eq_u32 s4, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 5 +; GFX11-NEXT: s_cmp_eq_u32 s4, 5 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 6 +; GFX11-NEXT: s_cmp_eq_u32 s4, 6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX11-NEXT: s_cmp_eq_u32 s0, 7 +; GFX11-NEXT: s_cmp_eq_u32 s4, 7 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 8 +; GFX11-NEXT: s_cmp_eq_u32 s4, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX11-NEXT: s_cmp_eq_u32 s0, 9 +; GFX11-NEXT: s_cmp_eq_u32 s4, 9 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 10 +; GFX11-NEXT: s_cmp_eq_u32 s4, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX11-NEXT: s_cmp_eq_u32 s0, 11 +; GFX11-NEXT: s_cmp_eq_u32 s4, 11 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 12 +; GFX11-NEXT: s_cmp_eq_u32 s4, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX11-NEXT: s_cmp_eq_u32 s0, 13 +; GFX11-NEXT: s_cmp_eq_u32 s4, 13 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, 14 +; GFX11-NEXT: s_cmp_eq_u32 s4, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX11-NEXT: s_cmp_eq_u32 s0, 15 +; GFX11-NEXT: s_cmp_eq_u32 s4, 15 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index d670d69947361c..a9e9aec96ff98e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -81,7 +81,7 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x ; SI: buffer_store_short ; SI: buffer_store_short -; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[2:3], 0x24 +; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[4:5], 0x24 ; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]] ; GFX89-DAG: buffer_store_short [[VLOAD0]], off ; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]] @@ -100,9 +100,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x ; SI: s_load_dwordx2 s ; SI: s_load_dwordx2 s -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x24 -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x4c -; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[2:3], 0x54 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x24 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x4c +; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x54 ; GCN-NOT: {{buffer|flat|global}} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index a7df29dbf7415c..39649922bd5d98 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v1i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 @@ -16,8 +16,8 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i ; ; VI-LABEL: extract_vector_elt_v1i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -32,8 +32,8 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s3, s2, 8 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -52,8 +52,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i ; ; VI-LABEL: extract_vector_elt_v2i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -80,8 +80,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s3, s2, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i ; ; VI-LABEL: extract_vector_elt_v3i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -128,8 +128,8 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s3, s2, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -148,8 +148,8 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i ; ; VI-LABEL: extract_vector_elt_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v8i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[6:7], 0x0 +; SI-NEXT: s_load_dword s0, s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -191,7 +191,7 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { ; ; VI-LABEL: extract_vector_elt_v8i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,8 +213,8 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s2, s[6:7], 0x4 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s3, s2, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -233,8 +233,8 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x ; ; VI-LABEL: extract_vector_elt_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -261,7 +261,7 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v32i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[6:7], 0x0 +; SI-NEXT: s_load_dword s0, s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { ; ; VI-LABEL: extract_vector_elt_v32i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -298,8 +298,8 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v64i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s2, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s3, s2, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -318,8 +318,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; ; VI-LABEL: extract_vector_elt_v64i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -351,9 +351,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0xa -; SI-NEXT: s_load_dword s3, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0xa +; SI-NEXT: s_load_dword s3, s[8:9], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffff ; SI-NEXT: s_lshl_b32 s3, s3, 3 @@ -367,9 +367,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out ; ; VI-LABEL: dynamic_extract_vector_elt_v2i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x4c -; VI-NEXT: s_load_dword s3, s[6:7], 0x28 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x4c +; VI-NEXT: s_load_dword s3, s[8:9], 0x28 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -388,9 +388,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x13 -; SI-NEXT: s_load_dword s3, s[6:7], 0xa -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x13 +; SI-NEXT: s_load_dword s3, s[8:9], 0xa +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s2, s2, 3 ; SI-NEXT: s_lshr_b32 s2, s3, s2 @@ -403,9 +403,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out ; ; VI-LABEL: dynamic_extract_vector_elt_v3i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x4c -; VI-NEXT: s_load_dword s3, s[6:7], 0x28 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x4c +; VI-NEXT: s_load_dword s3, s[8:9], 0x28 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_lshr_b32 s2, s3, s2 @@ -424,8 +424,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0xc +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dword s4, s[8:9], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_lshl_b32 s3, s4, 3 @@ -440,8 +440,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out ; ; VI-LABEL: dynamic_extract_vector_elt_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -463,8 +463,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v8i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dword s4, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; SI-NEXT: s_lshl_b32 s4, s4, 3 @@ -479,8 +479,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out ; ; VI-LABEL: dynamic_extract_vector_elt_v8i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index 06da7eea0b47dc..2a847e01fae389 100644 --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -8,22 +8,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s12, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_i32_e32 v0, vcc, s12, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -38,22 +38,22 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) %out define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, double %b) { ; GCN-LABEL: bitcast_fp_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] ; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f64 v[0:1], v[0:1], s[12:13] -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid @@ -68,22 +68,22 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_fpvector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s12, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_i32_e32 v0, vcc, s12, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -98,7 +98,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %o define amdgpu_kernel void @no_extract_volatile_load_extract0(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -122,7 +122,7 @@ entry: define amdgpu_kernel void @no_extract_volatile_load_extract2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract2: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -146,17 +146,17 @@ entry: define amdgpu_kernel void @no_extract_volatile_load_dynextract(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; GCN-LABEL: no_extract_volatile_load_dynextract: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s12, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 1 @@ -168,7 +168,7 @@ define amdgpu_kernel void @no_extract_volatile_load_dynextract(ptr addrspace(1) ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm entry: %vec = load volatile <4 x i32>, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 986f27b19dba27..7a81af5243ee07 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: s_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,8 +23,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: s_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: s_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -64,8 +64,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: s_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -76,8 +76,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: s_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: s_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -100,10 +100,10 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: s_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -116,8 +116,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -128,8 +128,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; VI-LABEL: s_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -140,8 +140,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; GFX9-LABEL: s_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -152,10 +152,10 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GFX11-LABEL: s_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -168,7 +168,7 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -181,7 +181,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -194,7 +194,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX9-LABEL: s_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -206,7 +206,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX11-LABEL: s_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -223,12 +223,12 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) { ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -239,8 +239,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; VI-LABEL: fabs_fold_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -252,8 +252,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; GFX9-LABEL: fabs_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -265,13 +265,13 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-LABEL: fabs_fold_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2 +; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in0) @@ -283,7 +283,7 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -297,7 +297,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -311,7 +311,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -322,7 +322,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -344,8 +344,8 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fabs_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -356,8 +356,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; VI-LABEL: fabs_free_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -368,8 +368,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fabs_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -380,10 +380,10 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fabs_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -399,7 +399,7 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_fold_self_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -425,7 +425,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: v_fabs_fold_self_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -443,7 +443,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_fabs_fold_self_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -456,7 +456,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_fabs_fold_self_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -480,8 +480,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 { ; CI-LABEL: v_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -508,8 +508,8 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -529,8 +529,8 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -543,18 +543,17 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_pk_mul_f16 v0, v0, s4 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid @@ -569,7 +568,7 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -592,7 +591,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; VI-LABEL: v_extract_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -611,7 +610,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -627,7 +626,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -660,7 +659,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_no_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -678,7 +677,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; VI-LABEL: v_extract_fabs_no_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -696,7 +695,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -710,7 +709,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll index f98124fe2ed731..5130ec3bc4dcdc 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -11,7 +11,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_fabs_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -39,7 +39,7 @@ define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fabs_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset0_b32 s3, 31 @@ -58,18 +58,18 @@ define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fabs_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s7, 31 -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_bitset0_b32 s1, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) store <2 x double> %fabs, ptr addrspace(1) %out @@ -79,24 +79,24 @@ define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { define amdgpu_kernel void @fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fabs_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s7, 31 -; SI-NEXT: s_bitset0_b32 s11, 31 -; SI-NEXT: s_bitset0_b32 s9, 31 -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: s_and_b32 s4, s11, 0x7fffffff +; SI-NEXT: s_and_b32 s5, s15, 0x7fffffff +; SI-NEXT: s_and_b32 s6, s13, 0x7fffffff +; SI-NEXT: s_and_b32 s7, s9, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; SI-NEXT: s_endpgm %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) @@ -107,15 +107,15 @@ define amdgpu_kernel void @fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { ; SI-LABEL: fabs_fold_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mul_f64 v[0:1], |s[8:9]|, v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in0) @@ -127,15 +127,15 @@ define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], doubl define amdgpu_kernel void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { ; SI-LABEL: fabs_fn_fold_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mul_f64 v[0:1], |s[8:9]|, v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm %fabs = call double @fabs(double %in0) @@ -147,7 +147,7 @@ define amdgpu_kernel void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], do define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: fabs_free_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset0_b32 s3, 31 @@ -167,7 +167,7 @@ define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: fabs_fn_free_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset0_b32 s3, 31 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 60e19dcd48f1e6..6bcb086944c919 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fabsf_fn_free: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -21,8 +21,8 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fabsf_fn_free: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -39,8 +39,8 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fabsf_free: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -51,8 +51,8 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fabsf_free: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -69,8 +69,8 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -81,8 +81,8 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: s_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -131,8 +131,8 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,10 +149,10 @@ define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-LABEL: fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: s_bitset0_b32 s1, 31 @@ -161,7 +161,7 @@ define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) @@ -172,7 +172,7 @@ define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, float %in1) { ; SI-LABEL: fabsf_fn_fold: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -185,7 +185,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa ; ; VI-LABEL: fabsf_fn_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { ; SI-LABEL: fabs_fold: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i ; ; VI-LABEL: fabs_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 @@ -232,8 +232,8 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,8 +243,8 @@ define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %i ; ; VI-LABEL: bitpreserve_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index e3c9e8ccdc3998..a94f27a0332c70 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -9,22 +9,22 @@ define amdgpu_kernel void @fadd_f16( ; SI-LABEL: fadd_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -35,22 +35,22 @@ define amdgpu_kernel void @fadd_f16( ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s0, s2 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 @@ -59,22 +59,22 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-SDAG-LABEL: fadd_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s11 -; GFX11-SDAG-NEXT: s_mov_b32 s2, s10 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s4 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s5 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s7 -; GFX11-SDAG-NEXT: s_mov_b32 s6, s10 ; GFX11-SDAG-NEXT: s_mov_b32 s7, s11 -; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc +; GFX11-SDAG-NEXT: s_mov_b32 s6, s10 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_mov_b32 s8, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s9, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s2 +; GFX11-SDAG-NEXT: s_mov_b32 s1, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s10 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s11 +; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -85,42 +85,42 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-GISEL-LABEL: fadd_f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s10 -; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s8, s4 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s9, s5 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s10 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s11 -; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s10 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11 +; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0 @@ -129,21 +129,21 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-GISEL-LABEL: fadd_f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; %entry @@ -186,7 +186,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_a( ; SI-LABEL: fadd_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -206,7 +206,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; VI-LABEL: fadd_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -224,7 +224,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -256,7 +256,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -274,7 +274,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -318,7 +318,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_b( ; SI-LABEL: fadd_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -338,7 +338,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; VI-LABEL: fadd_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -356,7 +356,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -374,7 +374,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -388,7 +388,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -406,7 +406,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -450,21 +450,21 @@ entry: define amdgpu_kernel void @fadd_v2f16( ; SI-LABEL: fadd_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -484,107 +484,107 @@ define amdgpu_kernel void @fadd_v2f16( ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fadd_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[8:9] -; GFX11-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX11-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v1, v0 -; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fadd_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[0:1] -; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v1, v0 -; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[8:9] -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, v1, v0 -; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-SDAG-NEXT: s_endpgm ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[0:1] -; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, v1, v0 -; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: ; %entry @@ -623,7 +623,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_a( ; SI-LABEL: fadd_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -650,7 +650,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; VI-LABEL: fadd_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -671,7 +671,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 @@ -688,7 +688,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -703,7 +703,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 @@ -720,7 +720,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -762,7 +762,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_b( ; SI-LABEL: fadd_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -789,7 +789,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; VI-LABEL: fadd_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -810,7 +810,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-SDAG-LABEL: fadd_v2f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 @@ -827,7 +827,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -842,7 +842,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 @@ -859,7 +859,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index dddd649888af27..6496b70b4d6973 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -73,7 +73,7 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -89,7 +89,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -108,7 +108,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -125,7 +125,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -215,7 +215,7 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -252,7 +252,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -270,7 +270,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -278,7 +278,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -350,7 +350,7 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -360,7 +360,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -370,7 +370,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -379,7 +379,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -388,7 +388,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -396,7 +396,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 26580618794d34..3199b76d279fab 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -21,7 +21,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_undef_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_undef_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -62,7 +62,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -74,7 +74,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; CI-LABEL: v_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] @@ -115,10 +115,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -126,22 +126,22 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; ; GFX9-LABEL: s_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v1, s4, s4 +; GFX9-NEXT: v_max_f16_e64 v1, s2, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[4:5], 0xb ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -150,11 +150,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; GFX11-LABEL: s_test_canonicalize_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, s4, s4 +; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %val = bitcast i16 %val.arg to half @@ -205,7 +205,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -217,7 +217,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -228,7 +228,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -259,7 +259,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -271,7 +271,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,7 +295,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -314,7 +314,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -326,7 +326,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -337,7 +337,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -350,7 +350,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -368,7 +368,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -391,7 +391,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -404,7 +404,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -422,7 +422,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -434,7 +434,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -445,7 +445,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -458,7 +458,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -477,7 +477,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -487,7 +487,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -495,7 +495,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -505,7 +505,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -518,7 +518,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -528,7 +528,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -537,7 +537,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_n0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -547,7 +547,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -560,7 +560,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -570,7 +570,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -579,7 +579,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -589,7 +589,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -602,7 +602,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -621,7 +621,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_n1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -631,7 +631,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -644,7 +644,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -654,7 +654,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -663,7 +663,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; CI-LABEL: test_fold_canonicalize_literal_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c00 @@ -673,7 +673,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -686,7 +686,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -696,7 +696,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -705,7 +705,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff @@ -715,7 +715,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -728,7 +728,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -738,7 +738,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -747,7 +747,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff @@ -757,7 +757,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -770,7 +770,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -780,7 +780,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -789,7 +789,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -799,7 +799,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -812,7 +812,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -822,7 +822,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -831,7 +831,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -841,7 +841,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -854,7 +854,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -864,7 +864,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -873,7 +873,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_qnan_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c00 @@ -883,7 +883,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -896,7 +896,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -906,7 +906,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -915,7 +915,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -925,7 +925,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -938,7 +938,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -948,7 +948,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -957,7 +957,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -967,7 +967,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -980,7 +980,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -990,7 +990,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -999,7 +999,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan0_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1041,7 +1041,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan1_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1074,7 +1074,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan2_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1093,7 +1093,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1106,7 +1106,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1125,7 +1125,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan3_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1135,7 +1135,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1148,7 +1148,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1166,7 +1166,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1178,7 +1178,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; CI-LABEL: v_test_canonicalize_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1201,7 +1201,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1240,7 +1240,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1253,7 +1253,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1276,7 +1276,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1330,7 +1330,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1396,7 +1396,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1454,12 +1454,12 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_max_f16_e64 v0, s2, s2 ; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1469,26 +1469,26 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v1, s2, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb +; CI-NEXT: s_load_dword s0, s[4:5], 0xb +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1498,11 +1498,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %val = bitcast i32 %val.arg to <2 x half> @@ -1514,7 +1514,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1524,7 +1524,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -1532,7 +1532,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1555,7 +1555,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x80008000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1565,7 +1565,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1574,7 +1574,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x80008000 @@ -1584,7 +1584,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1616,7 +1616,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c003c00 @@ -1626,7 +1626,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1649,7 +1649,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1658,7 +1658,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 @@ -1668,7 +1668,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1681,7 +1681,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1691,7 +1691,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX9-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1700,7 +1700,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; CI-LABEL: test_fold_canonicalize_literal_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c004c00 @@ -1710,7 +1710,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX11-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1733,7 +1733,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1742,7 +1742,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1752,7 +1752,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1765,7 +1765,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1775,7 +1775,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1784,7 +1784,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1794,7 +1794,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1817,7 +1817,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1826,7 +1826,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1836,7 +1836,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1849,7 +1849,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1859,7 +1859,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1868,7 +1868,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1878,7 +1878,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1891,7 +1891,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1901,7 +1901,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1910,7 +1910,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; CI-LABEL: test_fold_canonicalize_qnan_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c007c00 @@ -1920,7 +1920,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1933,7 +1933,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1943,7 +1943,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1952,7 +1952,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -1962,7 +1962,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1975,7 +1975,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1985,7 +1985,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2004,7 +2004,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2017,7 +2017,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2027,7 +2027,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2036,7 +2036,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2078,7 +2078,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2088,7 +2088,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2101,7 +2101,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2111,7 +2111,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2120,7 +2120,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2143,7 +2143,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2153,7 +2153,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2162,7 +2162,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2172,7 +2172,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2265,7 +2265,7 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2275,7 +2275,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -2283,7 +2283,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -2293,7 +2293,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -2565,7 +2565,7 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2576,7 +2576,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2585,7 +2585,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 @@ -2596,7 +2596,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index adf1635b29145c..4e12a30c6f6f4f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -23,7 +23,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -35,7 +35,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -46,7 +46,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -57,7 +57,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -74,8 +74,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s2, s[8:9], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -85,8 +85,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX8-LABEL: s_test_canonicalize_var_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -96,8 +96,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX9-LABEL: s_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -107,17 +107,17 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX11-LABEL: s_test_canonicalize_var_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_max_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -143,7 +143,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -154,7 +154,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -165,7 +165,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -183,7 +183,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -195,7 +195,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -206,7 +206,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -217,7 +217,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -236,7 +236,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -248,7 +248,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -259,7 +259,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -298,7 +298,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: test_fold_canonicalize_undef_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -306,7 +306,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_fold_canonicalize_undef_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -314,7 +314,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: test_fold_canonicalize_undef_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -327,7 +327,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -345,7 +345,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -353,7 +353,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -366,7 +366,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -376,7 +376,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -385,7 +385,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -408,7 +408,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -418,7 +418,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -427,7 +427,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -435,7 +435,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -448,7 +448,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -458,7 +458,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, -1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -467,7 +467,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -475,7 +475,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -488,7 +488,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -498,7 +498,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -507,7 +507,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -515,7 +515,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -528,7 +528,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -546,7 +546,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -554,7 +554,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -567,7 +567,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -578,7 +578,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -588,7 +588,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -597,7 +597,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -611,7 +611,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -622,7 +622,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -632,7 +632,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -641,7 +641,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -655,7 +655,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -666,7 +666,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -676,7 +676,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -685,7 +685,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -699,7 +699,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -709,7 +709,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -718,7 +718,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -726,7 +726,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -739,7 +739,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +749,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -758,7 +758,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -767,7 +767,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -781,7 +781,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -791,7 +791,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -800,7 +800,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -808,7 +808,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -821,7 +821,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -831,7 +831,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -840,7 +840,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -848,7 +848,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -861,7 +861,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -871,7 +871,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -880,7 +880,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -888,7 +888,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -901,7 +901,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -911,7 +911,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -920,7 +920,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -928,7 +928,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -941,7 +941,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -951,7 +951,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -960,7 +960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -968,7 +968,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -981,7 +981,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -991,7 +991,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1008,7 +1008,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1040,7 +1040,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1071,7 +1071,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1101,7 +1101,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1124,7 +1124,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1135,7 +1135,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1162,7 +1162,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX9-LABEL: s_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1181,7 +1181,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX11-LABEL: s_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX12-LABEL: s_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1227,7 +1227,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1238,7 +1238,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1279,7 +1279,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1290,7 +1290,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1309,7 +1309,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1321,7 +1321,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1406,7 +1406,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1417,7 +1417,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1426,7 +1426,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1435,7 +1435,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1460,7 +1460,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1469,7 +1469,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1477,7 +1477,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1490,7 +1490,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1510,7 +1510,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1518,7 +1518,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1531,7 +1531,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1551,7 +1551,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1559,7 +1559,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1572,7 +1572,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1602,7 +1602,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1617,7 +1617,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1628,7 +1628,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff @@ -1638,7 +1638,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1647,7 +1647,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1661,7 +1661,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1681,7 +1681,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1690,7 +1690,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1704,7 +1704,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1715,7 +1715,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff @@ -1725,7 +1725,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,7 +1734,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1748,7 +1748,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1759,7 +1759,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1768,7 +1768,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1776,7 +1776,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1789,7 +1789,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1800,7 +1800,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1817,7 +1817,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1830,7 +1830,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1841,7 +1841,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1850,7 +1850,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1858,7 +1858,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1871,7 +1871,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1882,7 +1882,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1891,7 +1891,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1899,7 +1899,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1912,7 +1912,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1923,7 +1923,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1932,7 +1932,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1940,7 +1940,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1953,7 +1953,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1964,7 +1964,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,7 +1973,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1981,7 +1981,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2022,7 +2022,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2035,7 +2035,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f64_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f64_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f64_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2080,7 +2080,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f64_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2093,7 +2093,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f64_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2115,7 +2115,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f32_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2132,7 +2132,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f32_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f32_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2160,7 +2160,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f32_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2173,7 +2173,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f32_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2195,7 +2195,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2213,7 +2213,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2230,7 +2230,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2241,7 +2241,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2277,7 +2277,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_v2f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2300,7 +2300,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2320,7 +2320,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2331,7 +2331,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2366,7 +2366,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f64_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2383,7 +2383,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f64_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2400,7 +2400,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f64_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2411,7 +2411,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f64_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2424,7 +2424,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f64_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2446,7 +2446,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f32_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2463,7 +2463,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f32_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2480,7 +2480,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f32_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2491,7 +2491,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f32_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2504,7 +2504,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f32_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2527,7 +2527,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2545,7 +2545,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2562,7 +2562,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2586,7 +2586,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2610,7 +2610,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2633,7 +2633,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2652,7 +2652,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2663,7 +2663,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2698,7 +2698,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 { ; GFX6-LABEL: v_test_canonicalize_var_v2f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2715,7 +2715,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_test_canonicalize_var_v2f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2732,7 +2732,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2745,7 +2745,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -2760,7 +2760,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_test_canonicalize_var_v2f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 59e52a86a2f5e8..b58996d656ecec 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -6,23 +6,23 @@ define amdgpu_kernel void @fcmp_f16_lt( ; SI-LABEL: fcmp_f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 @@ -32,48 +32,48 @@ define amdgpu_kernel void @fcmp_f16_lt( ; ; VI-LABEL: fcmp_f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -93,23 +93,23 @@ entry: define amdgpu_kernel void @fcmp_f16_lt_abs( ; SI-LABEL: fcmp_f16_lt_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 @@ -119,51 +119,51 @@ define amdgpu_kernel void @fcmp_f16_lt_abs( ; ; VI-LABEL: fcmp_f16_lt_abs: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], |v0|, |v1| ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt_abs: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 -; GFX11-NEXT: v_cmp_lt_f16_e64 s0, |v0|, |v1| +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_cmp_lt_f16_e64 s2, |v0|, |v1| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -183,23 +183,23 @@ entry: define amdgpu_kernel void @fcmp_f16_eq( ; SI-LABEL: fcmp_f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 @@ -209,48 +209,48 @@ define amdgpu_kernel void @fcmp_f16_eq( ; ; VI-LABEL: fcmp_f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -270,23 +270,23 @@ entry: define amdgpu_kernel void @fcmp_f16_le( ; SI-LABEL: fcmp_f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 @@ -296,48 +296,48 @@ define amdgpu_kernel void @fcmp_f16_le( ; ; VI-LABEL: fcmp_f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -357,23 +357,23 @@ entry: define amdgpu_kernel void @fcmp_f16_gt( ; SI-LABEL: fcmp_f16_gt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 @@ -383,48 +383,48 @@ define amdgpu_kernel void @fcmp_f16_gt( ; ; VI-LABEL: fcmp_f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -444,23 +444,23 @@ entry: define amdgpu_kernel void @fcmp_f16_lg( ; SI-LABEL: fcmp_f16_lg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 @@ -470,48 +470,48 @@ define amdgpu_kernel void @fcmp_f16_lg( ; ; VI-LABEL: fcmp_f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -531,23 +531,23 @@ entry: define amdgpu_kernel void @fcmp_f16_ge( ; SI-LABEL: fcmp_f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 @@ -557,48 +557,48 @@ define amdgpu_kernel void @fcmp_f16_ge( ; ; VI-LABEL: fcmp_f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -618,23 +618,23 @@ entry: define amdgpu_kernel void @fcmp_f16_o( ; SI-LABEL: fcmp_f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 @@ -644,48 +644,48 @@ define amdgpu_kernel void @fcmp_f16_o( ; ; VI-LABEL: fcmp_f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -705,23 +705,23 @@ entry: define amdgpu_kernel void @fcmp_f16_u( ; SI-LABEL: fcmp_f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 @@ -731,48 +731,48 @@ define amdgpu_kernel void @fcmp_f16_u( ; ; VI-LABEL: fcmp_f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_u: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -792,23 +792,23 @@ entry: define amdgpu_kernel void @fcmp_f16_nge( ; SI-LABEL: fcmp_f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 @@ -818,48 +818,48 @@ define amdgpu_kernel void @fcmp_f16_nge( ; ; VI-LABEL: fcmp_f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -879,23 +879,23 @@ entry: define amdgpu_kernel void @fcmp_f16_nlg( ; SI-LABEL: fcmp_f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 @@ -905,48 +905,48 @@ define amdgpu_kernel void @fcmp_f16_nlg( ; ; VI-LABEL: fcmp_f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -966,23 +966,23 @@ entry: define amdgpu_kernel void @fcmp_f16_ngt( ; SI-LABEL: fcmp_f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 @@ -992,48 +992,48 @@ define amdgpu_kernel void @fcmp_f16_ngt( ; ; VI-LABEL: fcmp_f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -1053,23 +1053,23 @@ entry: define amdgpu_kernel void @fcmp_f16_nle( ; SI-LABEL: fcmp_f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 @@ -1079,48 +1079,48 @@ define amdgpu_kernel void @fcmp_f16_nle( ; ; VI-LABEL: fcmp_f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nle: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -1140,23 +1140,23 @@ entry: define amdgpu_kernel void @fcmp_f16_neq( ; SI-LABEL: fcmp_f16_neq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 @@ -1166,48 +1166,48 @@ define amdgpu_kernel void @fcmp_f16_neq( ; ; VI-LABEL: fcmp_f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -1227,23 +1227,23 @@ entry: define amdgpu_kernel void @fcmp_f16_nlt( ; SI-LABEL: fcmp_f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 @@ -1253,48 +1253,48 @@ define amdgpu_kernel void @fcmp_f16_nlt( ; ; VI-LABEL: fcmp_f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -1314,21 +1314,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lt( ; SI-LABEL: fcmp_v2f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1346,21 +1346,21 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; ; VI-LABEL: fcmp_v2f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1369,27 +1369,27 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1417,21 +1417,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_eq( ; SI-LABEL: fcmp_v2f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1449,21 +1449,21 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; ; VI-LABEL: fcmp_v2f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1472,27 +1472,27 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1519,21 +1519,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_le( ; SI-LABEL: fcmp_v2f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1551,21 +1551,21 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; ; VI-LABEL: fcmp_v2f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1574,27 +1574,27 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1621,21 +1621,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_gt( ; SI-LABEL: fcmp_v2f16_gt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1653,21 +1653,21 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; ; VI-LABEL: fcmp_v2f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1676,27 +1676,27 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1724,21 +1724,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lg( ; SI-LABEL: fcmp_v2f16_lg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1756,21 +1756,21 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; ; VI-LABEL: fcmp_v2f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1779,27 +1779,27 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1827,21 +1827,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ge( ; SI-LABEL: fcmp_v2f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1859,21 +1859,21 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; ; VI-LABEL: fcmp_v2f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1882,27 +1882,27 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1930,21 +1930,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_o( ; SI-LABEL: fcmp_v2f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1962,21 +1962,21 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; ; VI-LABEL: fcmp_v2f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1985,27 +1985,27 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2033,21 +2033,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_u( ; SI-LABEL: fcmp_v2f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2065,21 +2065,21 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; ; VI-LABEL: fcmp_v2f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2088,27 +2088,27 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_u: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2135,21 +2135,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nge( ; SI-LABEL: fcmp_v2f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2167,21 +2167,21 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; ; VI-LABEL: fcmp_v2f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2190,27 +2190,27 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2237,21 +2237,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlg( ; SI-LABEL: fcmp_v2f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2269,21 +2269,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; ; VI-LABEL: fcmp_v2f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2292,27 +2292,27 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2340,21 +2340,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ngt( ; SI-LABEL: fcmp_v2f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2372,21 +2372,21 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; ; VI-LABEL: fcmp_v2f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2395,27 +2395,27 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2442,21 +2442,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nle( ; SI-LABEL: fcmp_v2f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2474,21 +2474,21 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; ; VI-LABEL: fcmp_v2f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2497,27 +2497,27 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nle: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2544,21 +2544,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_neq( ; SI-LABEL: fcmp_v2f16_neq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2576,21 +2576,21 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; ; VI-LABEL: fcmp_v2f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2599,27 +2599,27 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2646,21 +2646,21 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlt( ; SI-LABEL: fcmp_v2f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2678,21 +2678,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; ; VI-LABEL: fcmp_v2f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2701,27 +2701,27 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 13367d3bb36e29..a4573388731578 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -15,14 +15,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: s_lshr_b32 s0, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 @@ -32,14 +32,14 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; ; VI-LABEL: s_copysign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_movk_i32 s3, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_bfi_b32 v2, s3, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -47,29 +47,29 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; ; GFX9-LABEL: s_copysign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_bfi_b32 v1, s3, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %out = call half @llvm.copysign.f16(half %mag, half %sign) @@ -80,22 +80,22 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: s_and_b32 s4, s6, 0x7fff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -104,11 +104,11 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -116,10 +116,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; GFX11-LABEL: s_test_copysign_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -132,22 +132,22 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: s_and_b32 s4, s6, 0x7fff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -156,11 +156,11 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -168,10 +168,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; GFX11-LABEL: s_test_copysign_f16_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -184,22 +184,22 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: s_and_b32 s4, s6, 0x7fff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -208,11 +208,11 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_10.0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -220,10 +220,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; GFX11-LABEL: s_test_copysign_f16_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -236,22 +236,22 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 15 +; SI-NEXT: s_or_b32 s4, s6, 0x8000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -260,11 +260,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_bitset1_b32 s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -272,10 +272,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; GFX11-LABEL: s_test_copysign_f16_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -288,22 +288,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 15 +; SI-NEXT: s_or_b32 s4, s6, 0x8000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -312,11 +312,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_bitset1_b32 s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -324,10 +324,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; GFX11-LABEL: s_test_copysign_f16_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -340,12 +340,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 @@ -355,10 +355,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0x8000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -367,11 +367,11 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_0_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -379,10 +379,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; GFX11-LABEL: s_test_copysign_f16_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -396,12 +396,12 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 @@ -411,10 +411,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0x8000 ; VI-NEXT: s_or_b32 s2, s2, 0x3c00 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -424,11 +424,11 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -437,10 +437,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; GFX11-LABEL: s_test_copysign_f16_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -454,13 +454,13 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 @@ -470,10 +470,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; ; VI-LABEL: s_test_copysign_f16_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0x8000 ; VI-NEXT: s_or_b32 s2, s2, 0x4900 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -483,11 +483,11 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; ; GFX9-LABEL: s_test_copysign_f16_10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX9-NEXT: s_or_b32 s2, s2, 0x4900 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -496,10 +496,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; GFX11-LABEL: s_test_copysign_f16_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -513,12 +513,12 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 @@ -528,10 +528,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; ; VI-LABEL: s_test_copysign_f16_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0x8000 ; VI-NEXT: s_or_b32 s2, s2, 0x3c00 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -541,11 +541,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; ; GFX9-LABEL: s_test_copysign_f16_neg1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -554,10 +554,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; GFX11-LABEL: s_test_copysign_f16_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -571,13 +571,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 @@ -587,10 +587,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f16_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0x8000 ; VI-NEXT: s_or_b32 s2, s2, 0x4900 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -600,11 +600,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_copysign_f16_neg10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX9-NEXT: s_or_b32 s2, s2, 0x4900 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -613,10 +613,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f16_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -808,23 +808,23 @@ define half @v_test_copysign_f16_neg10(half %mag) { define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; SI-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -834,22 +834,22 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_ushort v2, v[1:2] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_brev_b32 s0, -2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -859,40 +859,40 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v1, s[6:7] -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0 -; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid @@ -908,23 +908,23 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[1:2], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 @@ -934,22 +934,22 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_ushort v2, v[1:2] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_brev_b32 s0, -2 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 @@ -959,42 +959,41 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v1 -; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] +; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v1 +; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: global_load_u16 v2, v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1 -; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid @@ -1010,23 +1009,23 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; SI-NEXT: buffer_load_ushort v0, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_bfi_b32 v0, s0, v3, v0 @@ -1035,22 +1034,22 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_brev_b32 s0, -2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1060,33 +1059,33 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v1, s[4:5] +; GFX11-NEXT: global_load_u16 v1, v1, s[6:7] ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1109,24 +1108,24 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[2:3], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64 +; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1136,22 +1135,22 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_brev_b32 s0, -2 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1161,33 +1160,33 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v1, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v1, s[4:5] +; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1210,49 +1209,49 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; SI-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 -; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_movk_i32 s0, 0x7fff -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1262,33 +1261,33 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v1, s[0:1] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff -; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_load_dword v1, v1, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT: global_store_short v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[4:5] +; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1311,48 +1310,48 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ushort v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_movk_i32 s0, 0x7fff -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v2, s0, v3, v2 @@ -1361,32 +1360,32 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1409,51 +1408,51 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; SI-NEXT: buffer_load_ushort v0, v[1:2], s[0:3], 0 addr64 -; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_movk_i32 s0, 0x7fff -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1463,40 +1462,40 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[2:3] +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] +; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX9-NEXT: global_store_short v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] -; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: global_load_u16 v0, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid @@ -1512,10 +1511,10 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: s_lshr_b32 s4, s3, 8 ; SI-NEXT: s_and_b32 s5, s3, 0x1ff ; SI-NEXT: s_and_b32 s6, s4, 0xffe @@ -1576,181 +1575,181 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s7, 8 -; VI-NEXT: s_and_b32 s1, s7, 0x1ff -; VI-NEXT: s_and_b32 s2, s0, 0xffe -; VI-NEXT: s_or_b32 s0, s1, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s0, s3, 8 +; VI-NEXT: s_and_b32 s1, s3, 0x1ff +; VI-NEXT: s_and_b32 s5, s0, 0xffe +; VI-NEXT: s_or_b32 s0, s1, s2 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: s_bfe_u32 s1, s3, 0xb0014 ; VI-NEXT: v_readfirstlane_b32 s0, v2 -; VI-NEXT: s_bfe_u32 s1, s7, 0xb0014 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_or_b32 s4, s2, s0 ; VI-NEXT: s_sub_i32 s2, 0x3f1, s1 +; VI-NEXT: s_or_b32 s5, s5, s0 ; VI-NEXT: v_med3_i32 v2, s2, 0, 13 -; VI-NEXT: s_or_b32 s0, s4, 0x1000 +; VI-NEXT: s_or_b32 s0, s5, 0x1000 ; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: s_lshr_b32 s2, s0, s2 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: s_add_i32 s5, s1, 0xfffffc10 +; VI-NEXT: s_add_i32 s6, s1, 0xfffffc10 ; VI-NEXT: v_readfirstlane_b32 s0, v2 -; VI-NEXT: s_lshl_b32 s1, s5, 12 +; VI-NEXT: s_lshl_b32 s1, s6, 12 ; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_or_b32 s1, s4, s1 -; VI-NEXT: s_cmp_lt_i32 s5, 1 -; VI-NEXT: s_cselect_b32 s6, s0, s1 -; VI-NEXT: s_and_b32 s2, s6, 7 +; VI-NEXT: s_or_b32 s1, s5, s1 +; VI-NEXT: s_cmp_lt_i32 s6, 1 +; VI-NEXT: s_cselect_b32 s7, s0, s1 +; VI-NEXT: s_and_b32 s2, s7, 7 ; VI-NEXT: s_cmp_gt_i32 s2, 5 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_cmp_eq_u32 s2, 3 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; VI-NEXT: s_lshr_b32 s2, s6, 2 +; VI-NEXT: s_lshr_b32 s2, s7, 2 ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_addc_u32 s0, s2, 0 -; VI-NEXT: s_cmp_lt_i32 s5, 31 +; VI-NEXT: s_cmp_lt_i32 s6, 31 ; VI-NEXT: s_cselect_b32 s2, s0, 0x7c00 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s5, 0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-NEXT: v_lshlrev_b32_e32 v2, 9, v2 -; VI-NEXT: s_cmpk_eq_i32 s5, 0x40f +; VI-NEXT: s_cmpk_eq_i32 s6, 0x40f ; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; VI-NEXT: s_movk_i32 s0, 0x7fff -; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s7, 8 -; GFX9-NEXT: s_and_b32 s1, s7, 0x1ff -; GFX9-NEXT: s_and_b32 s2, s0, 0xffe -; GFX9-NEXT: s_or_b32 s0, s1, s6 -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_bfe_u32 s1, s7, 0xb0014 -; GFX9-NEXT: s_or_b32 s6, s2, s0 -; GFX9-NEXT: s_sub_i32 s2, 0x3f1, s1 -; GFX9-NEXT: v_med3_i32 v1, s2, 0, 13 -; GFX9-NEXT: s_or_b32 s0, s6, 0x1000 +; GFX9-NEXT: s_lshr_b32 s4, s3, 8 +; GFX9-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX9-NEXT: s_and_b32 s7, s4, 0xffe +; GFX9-NEXT: s_or_b32 s2, s5, s2 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX9-NEXT: s_bfe_u32 s3, s3, 0xb0014 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: s_lshr_b32 s2, s0, s2 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 +; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX9-NEXT: s_or_b32 s7, s7, s2 +; GFX9-NEXT: v_med3_i32 v1, s4, 0, 13 +; GFX9-NEXT: s_or_b32 s2, s7, 0x1000 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_lshr_b32 s4, s2, s4 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: s_add_i32 s7, s1, 0xfffffc10 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_lshl_b32 s1, s7, 12 -; GFX9-NEXT: s_or_b32 s0, s2, s0 -; GFX9-NEXT: s_or_b32 s1, s6, s1 -; GFX9-NEXT: s_cmp_lt_i32 s7, 1 -; GFX9-NEXT: s_cselect_b32 s9, s0, s1 -; GFX9-NEXT: s_and_b32 s2, s9, 7 -; GFX9-NEXT: s_cmp_gt_i32 s2, 5 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s2, 3 +; GFX9-NEXT: s_add_i32 s8, s3, 0xfffffc10 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: s_lshl_b32 s3, s8, 12 +; GFX9-NEXT: s_or_b32 s2, s4, s2 +; GFX9-NEXT: s_or_b32 s3, s7, s3 +; GFX9-NEXT: s_cmp_lt_i32 s8, 1 +; GFX9-NEXT: s_cselect_b32 s9, s2, s3 +; GFX9-NEXT: s_and_b32 s4, s9, 7 +; GFX9-NEXT: s_cmp_gt_i32 s4, 5 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s4, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_lshr_b32 s4, s9, 2 +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: s_addc_u32 s2, s4, 0 +; GFX9-NEXT: s_cmp_lt_i32 s8, 31 +; GFX9-NEXT: s_cselect_b32 s4, s2, 0x7c00 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_lshr_b32 s2, s9, 2 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_addc_u32 s0, s2, 0 -; GFX9-NEXT: s_cmp_lt_i32 s7, 31 -; GFX9-NEXT: s_cselect_b32 s2, s0, 0x7c00 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1 -; GFX9-NEXT: s_cmpk_eq_i32 s7, 0x40f +; GFX9-NEXT: s_cmpk_eq_i32 s8, 0x40f ; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_movk_i32 s0, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff -; GFX11-NEXT: s_lshr_b32 s2, s7, 8 -; GFX11-NEXT: s_or_b32 s1, s1, s6 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffe -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-NEXT: s_lshr_b32 s6, s3, 8 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_and_b32 s5, s6, 0xffe +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX11-NEXT: s_bfe_u32 s1, s7, 0xb0014 -; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s1 -; GFX11-NEXT: s_addk_i32 s1, 0xfc10 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2 +; GFX11-NEXT: s_addk_i32 s2, 0xfc10 ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: s_lshl_b32 s7, s1, 12 +; GFX11-NEXT: s_lshl_b32 s7, s2, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s6, v1 -; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s5, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_or_b32 s3, s2, 0x1000 -; GFX11-NEXT: s_or_b32 s7, s2, s7 -; GFX11-NEXT: s_lshr_b32 s6, s3, s6 +; GFX11-NEXT: s_or_b32 s5, s3, 0x1000 +; GFX11-NEXT: s_or_b32 s7, s3, s7 +; GFX11-NEXT: s_lshr_b32 s6, s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s3, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: s_or_b32 s3, s6, s3 -; GFX11-NEXT: s_cmp_lt_i32 s1, 1 -; GFX11-NEXT: s_cselect_b32 s3, s3, s7 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: s_or_b32 s5, s6, s5 +; GFX11-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s5, s5, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s6, s3, 7 +; GFX11-NEXT: s_and_b32 s6, s5, 7 ; GFX11-NEXT: s_cmp_gt_i32 s6, 5 ; GFX11-NEXT: s_cselect_b32 s7, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0 -; GFX11-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-NEXT: s_lshr_b32 s5, s5, 2 ; GFX11-NEXT: s_or_b32 s6, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s6, 0 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmp_lt_i32 s1, 31 -; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 ; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 -; GFX11-NEXT: global_store_b16 v1, v0, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) @@ -1761,7 +1760,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1785,7 +1784,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; VI-LABEL: s_copysign_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_movk_i32 s4, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1805,26 +1804,26 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: s_lshr_b32 s1, s7, 16 -; GFX9-NEXT: s_lshr_b32 s2, s6, 16 -; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s3 @@ -1848,107 +1847,106 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: s_lshr_b32 s2, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; SI-NEXT: s_lshr_b32 s6, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_lshr_b32 s0, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v1, s2, v1, v5 -; SI-NEXT: v_bfi_b32 v0, s2, v0, v4 +; SI-NEXT: v_bfi_b32 v1, s0, v1, v5 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_lshr_b32 s3, s6, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_bfi_b32 v0, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_bfi_b32 v1, s2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: v_bfi_b32 v0, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_bfi_b32 v1, s6, v1, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 -; VI-NEXT: s_add_u32 s2, s0, 4 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_short v[0:1], v3 +; VI-NEXT: s_add_u32 s0, s4, 4 +; VI-NEXT: v_bfi_b32 v3, s6, v0, v1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_short v[0:1], v3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_copysign_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 -; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX9-NEXT: global_store_short v0, v2, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s6, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_mov_b32_e32 v2, s7 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 -; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-NEXT: global_store_b16 v3, v2, s[4:5] offset:4 +; GFX11-NEXT: global_store_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_endpgm %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign) store <3 x half> %out, ptr addrspace(1) %arg_out @@ -1958,124 +1956,124 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 +; SI-NEXT: s_lshr_b32 s0, s2, 16 +; SI-NEXT: s_lshr_b32 s9, s1, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: s_lshr_b32 s0, s3, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s3 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_bfi_b32 v1, s0, v1, v5 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v7 +; SI-NEXT: v_bfi_b32 v3, s0, v3, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 +; SI-NEXT: v_bfi_b32 v2, s0, v2, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_lshr_b32 s3, s7, 16 -; VI-NEXT: s_lshr_b32 s5, s5, 16 -; VI-NEXT: v_bfi_b32 v0, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshr_b32 s3, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s1, 16 +; VI-NEXT: v_bfi_b32 v0, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_bfi_b32 v1, s2, v1, v2 +; VI-NEXT: v_bfi_b32 v1, s6, v1, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_lshr_b32 s3, s6, 16 -; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_bfi_b32 v0, s2, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_bfi_b32 v2, s2, v2, v3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: v_bfi_b32 v0, s6, v0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_bfi_b32 v2, s6, v2, v3 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_copysign_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s3, s7, 16 -; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v3 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 -; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_lshr_b32 s2, s7, 16 -; GFX11-NEXT: s_lshr_b32 s6, s6, 16 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s5, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1 -; GFX11-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 -; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) store <4 x half> %out, ptr addrspace(1) %arg_out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 43cf26c422a7cf..fab45c9dc3bc3c 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) { ; SI-LABEL: s_test_copysign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; VI-LABEL: s_test_copysign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -34,7 +34,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; GFX11-LABEL: s_test_copysign_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -49,22 +49,22 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -74,10 +74,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -90,22 +90,22 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -115,10 +115,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -131,22 +131,22 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -156,10 +156,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -172,22 +172,22 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s6, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -197,10 +197,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -213,22 +213,22 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s6, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -238,10 +238,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -254,22 +254,22 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x80000000 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -279,10 +279,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -296,12 +296,12 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x80000000 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 ; SI-NEXT: s_or_b32 s4, s4, 1.0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -309,10 +309,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -323,10 +323,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -340,12 +340,12 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x80000000 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 ; SI-NEXT: s_or_b32 s4, s4, 0x41200000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -353,10 +353,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; ; VI-LABEL: s_test_copysign_f32_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -367,10 +367,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; GFX11-LABEL: s_test_copysign_f32_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -384,12 +384,12 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x80000000 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 ; SI-NEXT: s_or_b32 s4, s4, 1.0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -397,10 +397,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; ; VI-LABEL: s_test_copysign_f32_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -411,10 +411,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; GFX11-LABEL: s_test_copysign_f32_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -428,12 +428,12 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x80000000 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 ; SI-NEXT: s_or_b32 s4, s4, 0x41200000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -441,10 +441,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -455,10 +455,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -472,50 +472,50 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_bfi_b32 v1, s8, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_bfi_b32 v0, s8, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s6, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_bfi_b32 v0, s2, v2, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_bfi_b32 v0, s6, v2, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, ptr addrspace(1) %out, align 8 @@ -525,40 +525,40 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) { ; SI-LABEL: s_test_copysign_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_bfi_b32 v0, s6, v0, v2 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s14 +; SI-NEXT: v_bfi_b32 v2, s6, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s7, -2 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_bfi_b32 v2, s7, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_bfi_b32 v1, s7, v3, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_bfi_b32 v0, s7, v0, v3 +; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: v_bfi_b32 v1, s2, v3, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v3, s12 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v3 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] @@ -567,17 +567,17 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-LABEL: s_test_copysign_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_mov_b32_e32 v3, s12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v1 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s10, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s8, v3 ; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1] ; GFX11-NEXT: s_endpgm %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign) @@ -588,45 +588,45 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) { ; SI-LABEL: s_test_copysign_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_bfi_b32 v3, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_bfi_b32 v2, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_bfi_b32 v0, s6, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_brev_b32 s12, -2 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_bfi_b32 v2, s12, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_bfi_b32 v1, s12, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_bfi_b32 v0, s12, v0, v4 +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s15 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s14 +; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -635,18 +635,18 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo ; GFX11-LABEL: s_test_copysign_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s8 +; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v0 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v1 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v0 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s10, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v4 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s8, v5 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) @@ -878,9 +878,9 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) { define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) { ; SI-LABEL: s_test_copysign_f32_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -893,12 +893,12 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_test_copysign_f32_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_brev_b32 s0, -2 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -908,11 +908,11 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -928,7 +928,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -942,7 +942,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s3, 0x80000000 @@ -954,7 +954,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -971,7 +971,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -986,7 +986,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -999,7 +999,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1016,12 +1016,12 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f32_1_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_or_b32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1030,10 +1030,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out ; ; VI-LABEL: s_test_copysign_f32_1_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1045,10 +1045,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out ; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 1bcc4132007100..5f75a2f29a026f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -11,49 +11,49 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) #0 define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) { ; SI-LABEL: s_test_copysign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_bfi_b32 v1, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x74 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s4, -2 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x74 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, ptr addrspace(1) %out, align 8 @@ -63,21 +63,21 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_and_b32 s4, s7, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 ; GFX11-LABEL: s_test_copysign_f64_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitset0_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -107,21 +107,21 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_and_b32 s4, s7, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -134,8 +134,8 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 ; GFX11-LABEL: s_test_copysign_f64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitset0_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -151,21 +151,21 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_and_b32 s4, s7, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -178,8 +178,8 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 ; GFX11-LABEL: s_test_copysign_f64_10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitset0_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -195,21 +195,21 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s5, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_or_b32 s4, s7, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -222,8 +222,8 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x ; GFX11-LABEL: s_test_copysign_f64_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitset1_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -239,21 +239,21 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s5, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_or_b32 s4, s7, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -266,8 +266,8 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x ; GFX11-LABEL: s_test_copysign_f64_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitset1_b32 s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -283,31 +283,31 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) { ; SI-LABEL: s_test_copysign_f64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dword s6, s[2:3], 0x1d -; SI-NEXT: s_brev_b32 s7, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dword s4, s[4:5], 0x1d +; SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_bfi_b32 v1, s5, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dword s4, s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; VI-NEXT: s_brev_b32 s5, -2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dword s6, s[4:5], 0x74 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_bfi_b32 v1, s5, v0, v1 +; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -316,12 +316,12 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i ; GFX11-LABEL: s_test_copysign_f64_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -336,14 +336,14 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) { ; SI-LABEL: s_test_copysign_f64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dword s2, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 +; SI-NEXT: s_brev_b32 s6, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_bfi_b32 v1, s6, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -352,15 +352,15 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i ; ; VI-LABEL: s_test_copysign_f64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; VI-NEXT: s_brev_b32 s5, -2 +; VI-NEXT: s_load_dword s6, s[4:5], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4 +; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_bfi_b32 v1, s5, v1, v0 +; VI-NEXT: v_bfi_b32 v1, s4, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -369,12 +369,12 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i ; GFX11-LABEL: s_test_copysign_f64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -389,7 +389,7 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -403,7 +403,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_0_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -430,7 +430,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -445,7 +445,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -458,7 +458,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -474,7 +474,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -489,7 +489,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; VI-LABEL: s_test_copysign_f64_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -502,7 +502,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; GFX11-LABEL: s_test_copysign_f64_10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -518,7 +518,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d ; ; VI-LABEL: s_test_copysign_f64_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d ; ; GFX11-LABEL: s_test_copysign_f64_neg1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -562,7 +562,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -577,7 +577,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f64_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -590,7 +590,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f64_neg10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -606,38 +606,38 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) { ; SI-LABEL: s_test_copysign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_bfi_b32 v3, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s8, -2 -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s8, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s15 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s13 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_bfi_b32 v1, s8, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -645,16 +645,16 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou ; GFX11-LABEL: s_test_copysign_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11 -; GFX11-NEXT: v_mov_b32_e32 v2, s9 -; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s15 +; GFX11-NEXT: v_mov_b32_e32 v2, s13 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) @@ -665,52 +665,52 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) { ; SI-LABEL: s_test_copysign_v3f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_bfi_b32 v3, s6, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v2, s21 +; SI-NEXT: v_bfi_b32 v5, s6, v0, v2 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s10, -2 -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s10, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s17 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_bfi_b32 v1, s10, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v2, s17 ; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_bfi_b32 v5, s10, v0, v2 -; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -718,17 +718,17 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou ; GFX11-LABEL: s_test_copysign_v3f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15 -; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s13 :: v_dual_mov_b32 v4, s8 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s19 +; GFX11-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v7, s17 :: v_dual_mov_b32 v4, s12 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s9, v5 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v7 +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s13, v5 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v1 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v7 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] @@ -741,60 +741,60 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) { ; SI-LABEL: s_test_copysign_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11 +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v2, s19 -; SI-NEXT: v_bfi_b32 v7, s0, v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, s19 +; SI-NEXT: v_bfi_b32 v3, s6, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_mov_b32_e32 v2, s23 +; SI-NEXT: v_bfi_b32 v7, s6, v0, v2 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v2, s21 +; SI-NEXT: v_bfi_b32 v5, s6, v0, v2 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s12, -2 -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s17 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_bfi_b32 v7, s2, v0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 ; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: v_bfi_b32 v1, s12, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v2, s19 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_bfi_b32 v7, s12, v0, v2 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v2, s17 ; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_bfi_b32 v5, s12, v0, v2 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v6, s10 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -802,20 +802,20 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou ; GFX11-LABEL: s_test_copysign_v4f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15 -; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10 -; GFX11-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s19 +; GFX11-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s14 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v4, s8 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s7, v1 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v3 +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s11, v1 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s15, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v9 -; GFX11-NEXT: v_mov_b32_e32 v6, s6 -; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s5, v5 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s13, v9 +; GFX11-NEXT: v_mov_b32_e32 v6, s10 +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s9, v5 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index d1676b13c12963..5b024a345edbab 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -10,23 +10,22 @@ define amdgpu_kernel void @v_fdiv_f16( ; SI-LABEL: v_fdiv_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2 +; SI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, v2 ; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: v_div_scale_f32 v6, vcc, v2, v3, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -41,26 +40,27 @@ define amdgpu_kernel void @v_fdiv_f16( ; SI-NEXT: v_div_fixup_f32 v2, v4, v3, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: v_fdiv_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX8-NEXT: v_rcp_f32_e32 v3, v0 @@ -72,7 +72,7 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX8-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc ; GFX8-NEXT: v_div_fixup_f16 v2, v3, v2, v5 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -80,13 +80,13 @@ define amdgpu_kernel void @v_fdiv_f16( ; ; GFX9-LABEL: v_fdiv_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 @@ -100,19 +100,19 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_fdiv_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 @@ -126,21 +126,21 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v2, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_fdiv_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 @@ -160,7 +160,7 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -181,7 +181,7 @@ entry: define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -212,7 +212,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rcp_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -229,29 +229,29 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rcp_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -275,7 +275,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -306,7 +306,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_abs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -323,29 +323,29 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_abs: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e64 v1, |v1| -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16_abs: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e64 v1, |v1| -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16_abs: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -372,7 +372,7 @@ entry: define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: reciprocal_f16_rounded: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -403,7 +403,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX8-LABEL: reciprocal_f16_rounded: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -420,29 +420,29 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX9-LABEL: reciprocal_f16_rounded: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: reciprocal_f16_rounded: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: reciprocal_f16_rounded: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -466,7 +466,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -484,7 +484,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -501,29 +501,29 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16_afn: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16_afn: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -547,7 +547,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -578,7 +578,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -595,29 +595,29 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rcp_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rcp_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -641,7 +641,7 @@ entry: define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -675,7 +675,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rsq_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -692,29 +692,29 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rsq_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rsq_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rsq_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -739,7 +739,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -773,7 +773,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rsq_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -791,31 +791,31 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rsq_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rsq_f16_e32 v1, v1 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rsq_f16_e32 v1, v1 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -842,7 +842,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_multi_use: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -878,7 +878,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX8-LABEL: v_rsq_f16_multi_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -897,33 +897,33 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX9-LABEL: v_rsq_f16_multi_use: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rsq_f16_e32 v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v2, s[4:5] +; GFX9-NEXT: global_store_short v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_multi_use: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rsq_f16_e32 v2, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_short v0, v2, s[4:5] +; GFX10-NEXT: global_store_short v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_multi_use: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -951,7 +951,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract0: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -985,7 +985,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1003,31 +1003,31 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract0: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_missing_contract0: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_missing_contract0: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1054,7 +1054,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1106,31 +1106,31 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1157,7 +1157,7 @@ entry: define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_neg_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1191,7 +1191,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX8-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1209,31 +1209,31 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX9-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX9-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX10-NEXT: v_rcp_f16_e64 v1, -v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1260,47 +1260,47 @@ entry: define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_fdiv_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 ; SI-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: v_fdiv_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_ushort v0, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_rcp_f16_e32 v2, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mul_f16_e32 v2, v5, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -1308,52 +1308,52 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; ; GFX9-LABEL: v_fdiv_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_fdiv_f16_afn: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_fdiv_f16_afn: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1371,47 +1371,47 @@ entry: define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #2 { ; SI-LABEL: v_fdiv_f16_unsafe: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 ; SI-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: v_fdiv_f16_unsafe: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_ushort v0, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_rcp_f16_e32 v2, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mul_f16_e32 v2, v5, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -1419,52 +1419,52 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; ; GFX9-LABEL: v_fdiv_f16_unsafe: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_fdiv_f16_unsafe: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_fdiv_f16_unsafe: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1482,7 +1482,7 @@ entry: define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_2_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_2_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1509,7 +1509,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_2_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1520,7 +1520,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_2_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1531,7 +1531,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_2_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1547,7 +1547,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1562,7 +1562,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1574,7 +1574,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1585,7 +1585,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_neg_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_neg_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1639,7 +1639,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_neg_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1650,7 +1650,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_neg_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1661,7 +1661,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_neg_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index f34739c5ca25b2..33910947e6fac8 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -68,7 +68,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ninf: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -118,10 +118,10 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -133,13 +133,13 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -179,7 +179,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -227,7 +227,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ieee: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ieee: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -273,26 +273,26 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ieee: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 @@ -330,7 +330,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_25ulp_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 @@ -349,7 +349,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX8-LABEL: s_fdiv_25ulp_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -366,21 +366,21 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7| -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0 -; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -414,7 +414,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -440,7 +440,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +459,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 @@ -476,23 +476,23 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 -; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s6 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 +; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -527,7 +527,7 @@ entry: define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -540,7 +540,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -551,17 +551,17 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_fast_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -589,7 +589,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -602,7 +602,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -613,17 +613,17 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -651,7 +651,7 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -664,7 +664,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -675,17 +675,17 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -713,7 +713,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -739,7 +739,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -765,7 +765,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX7-LABEL: s_fdiv_f32_arcp_daz: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -791,7 +791,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_daz: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -815,10 +815,10 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -830,13 +830,13 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -876,7 +876,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -889,7 +889,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -900,17 +900,17 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -938,16 +938,16 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[8:9], s7, s7, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s5, v0, s5 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -956,13 +956,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[8:9], s6, s6, v2 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s7, v1 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s11, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -972,19 +972,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v2 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s10, v2 ; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-FASTFMA-NEXT: s_endpgm ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -994,15 +995,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s7, v0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s3, v0 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1012,23 +1012,22 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v4 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s2, v4 +; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f32 v2, s[8:9], s7, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s5, v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1037,13 +1036,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX7-NEXT: v_div_scale_f32 v3, s[8:9], s6, s6, v2 +; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_fixup_f32 v1, v0, s7, v1 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 +; GFX7-NEXT: v_div_fixup_f32 v1, v0, s11, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -1053,19 +1052,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v0, s6, v2 +; GFX7-NEXT: v_div_fixup_f32 v0, v0, s10, v2 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1075,13 +1074,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0 +; GFX8-NEXT: v_div_fixup_f32 v1, v1, s3, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1094,16 +1093,16 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4 +; GFX8-NEXT: v_div_fixup_f32 v0, v0, s2, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s5 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v0, s6, s3, s3, s1 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1113,12 +1112,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s0, s6, s6, s4 +; GFX10-NEXT: v_div_scale_f32 v2, s6, s2, s2, s0 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5 -; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 +; GFX10-NEXT: v_div_fixup_f32 v1, v0, s3, s1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -1129,19 +1128,19 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s4 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, s0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 +; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s1 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1152,11 +1151,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v2, null, s6, s6, s4 +; GFX11-NEXT: v_div_scale_f32 v2, null, s2, s2, s0 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: v_div_fixup_f32 v1, v0, s7, s5 -; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 +; GFX11-NEXT: v_div_fixup_f32 v1, v0, s3, s1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -1168,8 +1167,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s6, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32: @@ -1194,58 +1193,58 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX67-NEXT: s_mov_b32 s3, 0xf000 -; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_mov_b32 s7, 0xf000 +; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s6 -; GFX67-NEXT: v_rcp_f32_e32 v1, s7 -; GFX67-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX67-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX67-NEXT: v_rcp_f32_e32 v0, s2 +; GFX67-NEXT: v_rcp_f32_e32 v1, s3 +; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s6 -; GFX8-NEXT: v_rcp_f32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX8-NEXT: v_rcp_f32_e32 v0, s2 +; GFX8-NEXT: v_rcp_f32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s6 -; GFX10-NEXT: v_rcp_f32_e32 v1, s7 -; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s2 +; GFX10-NEXT: v_rcp_f32_e32 v1, s3 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s6 -; GFX11-NEXT: v_rcp_f32_e32 v1, s7 +; GFX11-NEXT: v_rcp_f32_e32 v0, s2 +; GFX11-NEXT: v_rcp_f32_e32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_ulp25_v2f32: @@ -1270,58 +1269,58 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX67-NEXT: s_mov_b32 s3, 0xf000 -; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_mov_b32 s7, 0xf000 +; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s7 -; GFX67-NEXT: v_rcp_f32_e32 v2, s6 -; GFX67-NEXT: v_mul_f32_e32 v1, s5, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX67-NEXT: v_rcp_f32_e32 v0, s3 +; GFX67-NEXT: v_rcp_f32_e32 v2, s2 +; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s7 -; GFX8-NEXT: v_rcp_f32_e32 v2, s6 -; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_rcp_f32_e32 v2, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_rcp_f32_e32 v2, s2 +; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s7 -; GFX11-NEXT: v_rcp_f32_e32 v2, s6 +; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_fast_math: @@ -1346,58 +1345,58 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX67-NEXT: s_mov_b32 s3, 0xf000 -; GFX67-NEXT: s_mov_b32 s2, -1 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_mov_b32 s7, 0xf000 +; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s7 -; GFX67-NEXT: v_rcp_f32_e32 v2, s6 -; GFX67-NEXT: v_mul_f32_e32 v1, s5, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX67-NEXT: v_rcp_f32_e32 v0, s3 +; GFX67-NEXT: v_rcp_f32_e32 v2, s2 +; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s7 -; GFX8-NEXT: v_rcp_f32_e32 v2, s6 -; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s7 -; GFX10-NEXT: v_rcp_f32_e32 v2, s6 -; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s3 +; GFX10-NEXT: v_rcp_f32_e32 v2, s2 +; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s7 -; GFX11-NEXT: v_rcp_f32_e32 v2, s6 +; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_arcp_math: @@ -1422,7 +1421,7 @@ entry: define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v4f32: ; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 @@ -1493,7 +1492,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v4f32: ; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1564,7 +1563,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX7-LABEL: s_fdiv_v4f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 @@ -1635,7 +1634,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: s_fdiv_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1706,7 +1705,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: s_fdiv_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1768,7 +1767,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: s_fdiv_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1867,7 +1866,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_fast_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -1886,7 +1885,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_fast_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -1905,7 +1904,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_fast_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1923,7 +1922,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_fast_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,7 +1972,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_arcp_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -1992,7 +1991,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_arcp_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -2011,7 +2010,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_arcp_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2029,7 +2028,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_arcp_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2079,8 +2078,8 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2102,13 +2101,13 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dword s6, s[4:5], 0xb ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s6, s6, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s6, 1.0 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -2119,15 +2118,15 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, 1.0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2149,11 +2148,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 -; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s2, s2, 1.0 +; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s2, 1.0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -2164,7 +2163,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 +; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2173,11 +2172,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s2, s2, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -2189,7 +2188,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2197,11 +2196,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -2214,7 +2213,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -2237,8 +2236,8 @@ entry: define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2258,13 +2257,13 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dword s6, s[4:5], 0xb ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 +; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s6, s6, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s6, 1.0 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v0, v2, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, v3, v2, v2 @@ -2273,15 +2272,15 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, v4, v2, v3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, 1.0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2301,11 +2300,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 -; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s2, s2, 1.0 +; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s2, 1.0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 ; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2 @@ -2314,7 +2313,7 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v3, v4, v2, v3 ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 +; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2323,21 +2322,21 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s2, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2345,22 +2344,22 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 8e43bd890a8fa4..da7cae86cf1eac 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -69,7 +69,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -102,7 +102,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -135,10 +135,10 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v2, s1 @@ -160,21 +160,21 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 ; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 ; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, s1 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, s2 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, s3 -; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-FLUSH-NEXT: s_endpgm %load = load <4 x float>, ptr addrspace(1) %arg, align 16 %div = fdiv <4 x float> , %load, !fpmath !0 @@ -185,10 +185,10 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v2, -s1 @@ -210,21 +210,21 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 ; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 ; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0 ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v1, -s1 ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v2, -s2 ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v3, -s3 -; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-FLUSH-NEXT: s_endpgm %load = load <4 x float>, ptr addrspace(1) %arg, align 16 %div = fdiv <4 x float> , %load, !fpmath !0 @@ -235,10 +235,10 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v2, -s1 @@ -260,21 +260,21 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 ; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 ; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0 ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v1, -s1 ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v2, -s2 ; GCN-FLUSH-NEXT: v_rcp_f32_e64 v3, -s3 -; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-FLUSH-NEXT: s_endpgm %load = load <4 x float>, ptr addrspace(1) %arg, align 16 %neg = fneg <4 x float> %load @@ -286,10 +286,10 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v2, s1 @@ -311,21 +311,21 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg ; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 ; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 0, v5 ; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, s1 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, s2 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v3, s3 -; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-FLUSH-NEXT: s_endpgm %load = load <4 x float>, ptr addrspace(1) %arg, align 16 %neg = fneg <4 x float> %load @@ -337,10 +337,10 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s0 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v2, s1 @@ -364,17 +364,17 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 2, v5 ; GCN-DENORM-NEXT: v_ldexp_f32 v0, v7, v0 ; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 ; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc @@ -390,7 +390,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-FLUSH-NEXT: v_mul_f32_e32 v6, -2.0, v6 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v6 -; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-FLUSH-NEXT: s_endpgm %load = load <4 x float>, ptr addrspace(1) %arg, align 16 %div = fdiv <4 x float> , %load, !fpmath !0 @@ -401,10 +401,10 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v1, -s0 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v2, -s1 @@ -428,17 +428,17 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 2, v5 ; GCN-DENORM-NEXT: v_ldexp_f32 v0, v7, v0 ; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 ; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc @@ -456,7 +456,7 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0 ; GCN-FLUSH-NEXT: v_add_f32_e32 v3, v8, v8 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 -; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-FLUSH-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GCN-FLUSH-NEXT: s_endpgm %load = load <4 x float>, ptr addrspace(1) %arg, align 16 %neg = fneg <4 x float> %load @@ -468,17 +468,17 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { ; GCN-DENORM-LABEL: div_v_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-DENORM-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s4 -; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, s4 +; GCN-DENORM-NEXT: s_load_dword s3, s[0:1], 0x0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, s2 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s2 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s3 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s3 ; GCN-DENORM-NEXT: v_sub_u32_e32 v2, v2, v4 ; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v1 ; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 @@ -487,19 +487,19 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { ; ; GCN-FLUSH-LABEL: div_v_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_load_dword s3, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 ; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s2, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s3, v0 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s2, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[0:1] ; GCN-FLUSH-NEXT: s_endpgm @@ -512,7 +512,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -529,7 +529,7 @@ define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_fast: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -540,7 +540,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -558,7 +558,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_minus_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -576,7 +576,7 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -606,7 +606,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -627,7 +627,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -656,7 +656,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -677,7 +677,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -706,7 +706,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -727,7 +727,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -757,7 +757,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 798cd6239d2621..c3694f3b92fb42 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -7,10 +7,10 @@ declare void @extern_func() #0 define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLAT_SCR_OPT-NEXT: s_add_u32 s8, s8, s13 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s9, s9, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1 @@ -37,10 +37,10 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLAT_SCR_OPT-NEXT: s_add_u32 s8, s8, s13 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s9, s9, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 ; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 @@ -62,43 +62,45 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() { define amdgpu_kernel void @kernel_calls_no_stack() { ; FLAT_SCR_OPT-LABEL: kernel_calls_no_stack: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 +; FLAT_SCR_OPT-NEXT: s_add_u32 s8, s8, s13 ; FLAT_SCR_OPT-NEXT: s_mov_b32 s32, 0 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s14, s10 -; FLAT_SCR_OPT-NEXT: s_mov_b64 s[10:11], s[4:5] +; FLAT_SCR_OPT-NEXT: s_addc_u32 s9, s9, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[8:9], s[4:5] ; FLAT_SCR_OPT-NEXT: s_getpc_b64 s[4:5] ; FLAT_SCR_OPT-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; FLAT_SCR_OPT-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s13, s9 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s12, s8 -; FLAT_SCR_OPT-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLAT_SCR_OPT-NEXT: s_mov_b64 s[8:9], s[2:3] +; FLAT_SCR_OPT-NEXT: s_mov_b32 s14, s12 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s13, s11 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s12, s10 +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[10:11], s[6:7] ; FLAT_SCR_OPT-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[6:7], s[2:3] ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[6:7] +; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: kernel_calls_no_stack: ; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[10:11], s[4:5] +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s13, s9 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s12, s8 +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[8:9], s[4:5] ; FLAT_SCR_ARCH-NEXT: s_getpc_b64 s[4:5] ; FLAT_SCR_ARCH-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; FLAT_SCR_ARCH-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s14, s8 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s14, s10 +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[10:11], s[6:7] ; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[4:5], s[0:1] -; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[8:9], s[2:3] -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s12, s6 +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[6:7], s[2:3] ; FLAT_SCR_ARCH-NEXT: v_or3_b32 v31, v0, v1, v2 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s13, s7 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 s32, 0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) ; FLAT_SCR_ARCH-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -111,10 +113,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: test: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dword vcc_lo, s[4:5], 0x8 ; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_writelane_b32 v0, s0, 0 ; GCN-NEXT: v_writelane_b32 v0, s1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 9d9d5b239a12c8..77eb9c495cead5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -36,7 +36,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -57,7 +57,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -97,7 +97,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -144,7 +144,7 @@ bb: define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -166,7 +166,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 @@ -188,7 +188,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 @@ -208,7 +208,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 @@ -230,7 +230,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -246,7 +246,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -281,7 +281,7 @@ bb: define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -303,7 +303,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 @@ -325,7 +325,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 @@ -345,7 +345,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 @@ -367,7 +367,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -383,7 +383,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -418,7 +418,7 @@ bb: define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -441,7 +441,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +484,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -506,7 +506,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -521,7 +521,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -555,7 +555,7 @@ bb: define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -577,7 +577,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 @@ -600,7 +600,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 @@ -621,7 +621,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 @@ -644,7 +644,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -661,7 +661,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -697,7 +697,7 @@ bb: define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -719,7 +719,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 @@ -742,7 +742,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 @@ -763,7 +763,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 @@ -786,7 +786,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -803,7 +803,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -839,7 +839,7 @@ bb: define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -862,7 +862,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -884,7 +884,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -905,7 +905,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -927,7 +927,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -942,7 +942,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -976,7 +976,7 @@ bb: define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -998,7 +998,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 @@ -1042,7 +1042,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -1118,7 +1118,7 @@ bb: define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 @@ -1139,7 +1139,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 @@ -1162,7 +1162,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 @@ -1206,7 +1206,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 105174d7c9b3b7..8123f1270ab65d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -14,12 +14,12 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -31,10 +31,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX10-LABEL: zero_init_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 @@ -83,18 +83,18 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX9-PAL-LABEL: zero_init_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 @@ -120,15 +120,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 @@ -145,15 +145,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 @@ -376,9 +376,9 @@ define void @zero_init_foo() { define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -392,11 +392,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -410,7 +410,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -424,7 +424,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 @@ -438,15 +438,15 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 @@ -458,7 +458,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -472,16 +472,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX10-PAL-NEXT: s_mov_b32 s12, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX10-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -495,7 +495,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -509,7 +509,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -673,9 +673,9 @@ bb: define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off @@ -687,10 +687,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -727,17 +727,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc @@ -758,15 +758,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX10-PAL-NEXT: s_mov_b32 s12, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX10-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX10-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 @@ -1034,8 +1034,8 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-LABEL: zero_init_small_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1054,10 +1054,10 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX10-LABEL: zero_init_small_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -1112,17 +1112,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_small_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 @@ -1153,15 +1153,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1180,15 +1180,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -1441,9 +1441,9 @@ define void @zero_init_small_offset_foo() { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1461,11 +1461,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -1519,15 +1519,15 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 @@ -1544,7 +1544,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -1562,16 +1562,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1590,16 +1590,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1030-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1617,7 +1617,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1635,7 +1635,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1873,8 +1873,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1890,10 +1890,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc @@ -1936,16 +1936,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0 @@ -1972,15 +1972,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -1996,15 +1996,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc @@ -2228,8 +2228,8 @@ bb: define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-LABEL: zero_init_large_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2249,10 +2249,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX10-LABEL: zero_init_large_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -2309,17 +2309,17 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_large_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 @@ -2352,15 +2352,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2380,15 +2380,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -2695,9 +2695,9 @@ define void @zero_init_large_offset_foo() { define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2715,11 +2715,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -2737,7 +2737,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -2755,7 +2755,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -2773,15 +2773,15 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 @@ -2798,7 +2798,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -2816,16 +2816,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2844,16 +2844,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1030-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2871,7 +2871,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2889,7 +2889,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -3127,8 +3127,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3144,10 +3144,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3191,16 +3191,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 @@ -3228,15 +3228,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -3252,15 +3252,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3488,8 +3488,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -3505,10 +3505,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 @@ -3546,15 +3546,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 @@ -3581,15 +3581,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1010-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3605,15 +3605,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX1030-PAL-NEXT: s_mov_b32 s12, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3811,10 +3811,10 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -3827,11 +3827,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: store_load_vidx_sidx_offset: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s8, s8, s13 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3844,7 +3844,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3858,7 +3858,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3871,16 +3871,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX9-PAL-NEXT: s_mov_b32 s12, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 @@ -3891,7 +3891,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -3906,16 +3906,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[12:13] +; GFX10-PAL-NEXT: s_mov_b32 s12, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s12, s12, s11 +; GFX10-PAL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3928,7 +3928,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3942,7 +3942,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index e0377ddad14c05..6727ebd10b92d5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -21,8 +21,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -36,12 +36,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -55,8 +55,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 0xffc ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 0xffc ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -85,12 +85,12 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -104,8 +104,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -119,8 +119,8 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -134,14 +134,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -155,53 +155,53 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_add_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -214,18 +214,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -233,18 +233,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_add_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -252,13 +252,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_add_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -276,65 +276,65 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -348,8 +348,8 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -361,8 +361,8 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -374,12 +374,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -392,49 +392,49 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_add_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -446,16 +446,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -463,16 +463,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -480,13 +480,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_add_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -503,61 +503,61 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_add_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -570,8 +570,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -585,8 +585,8 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -600,12 +600,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -619,53 +619,53 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -678,18 +678,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -697,18 +697,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_and_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -716,13 +716,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_and_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -740,65 +740,65 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -812,8 +812,8 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -825,8 +825,8 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -838,12 +838,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -856,49 +856,49 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_and_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -910,16 +910,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -927,16 +927,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -944,13 +944,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_and_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -967,61 +967,61 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_and_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -1034,8 +1034,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1049,8 +1049,8 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1064,12 +1064,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1083,53 +1083,53 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -1142,18 +1142,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1161,18 +1161,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1180,13 +1180,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_sub_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -1204,65 +1204,65 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -1276,8 +1276,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1289,8 +1289,8 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1302,12 +1302,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1320,49 +1320,49 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -1374,16 +1374,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1391,16 +1391,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1408,13 +1408,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_sub_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -1431,61 +1431,61 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -1498,8 +1498,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1512,8 +1512,8 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_max_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1526,12 +1526,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_max_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -1544,52 +1544,52 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -1603,49 +1603,49 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -1662,64 +1662,64 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -1734,8 +1734,8 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1746,8 +1746,8 @@ define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_max_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1758,12 +1758,12 @@ define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_max_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -1775,48 +1775,48 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -1829,45 +1829,45 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -1883,60 +1883,60 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -1950,8 +1950,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1964,8 +1964,8 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_umax_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1978,12 +1978,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_umax_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -1996,52 +1996,52 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2055,49 +2055,49 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -2114,64 +2114,64 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2186,8 +2186,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2198,8 +2198,8 @@ define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_umax_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2210,12 +2210,12 @@ define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_umax_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -2227,48 +2227,48 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2281,45 +2281,45 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -2335,60 +2335,60 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2402,8 +2402,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2416,8 +2416,8 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2430,12 +2430,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -2448,52 +2448,52 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2507,49 +2507,49 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -2566,64 +2566,64 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2638,8 +2638,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2650,8 +2650,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2662,12 +2662,12 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -2679,48 +2679,48 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2733,45 +2733,45 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -2787,60 +2787,60 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2854,8 +2854,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2868,8 +2868,8 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_umin_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2882,12 +2882,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_umin_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -2900,52 +2900,52 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -2959,49 +2959,49 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -3018,64 +3018,64 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -3090,8 +3090,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3102,8 +3102,8 @@ define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_umin_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3114,12 +3114,12 @@ define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_umin_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_endpgm @@ -3131,48 +3131,48 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -3185,45 +3185,45 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -3239,60 +3239,60 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -3306,8 +3306,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3321,8 +3321,8 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3336,12 +3336,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3355,53 +3355,53 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -3414,18 +3414,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3433,18 +3433,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3452,13 +3452,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN3-LABEL: atomic_or_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -3476,65 +3476,65 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -3548,8 +3548,8 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3561,8 +3561,8 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3574,12 +3574,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3592,49 +3592,49 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_or_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -3646,16 +3646,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3663,16 +3663,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3680,13 +3680,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN3-LABEL: atomic_or_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -3703,61 +3703,61 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_or_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -3770,8 +3770,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3785,8 +3785,8 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3800,12 +3800,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3819,8 +3819,8 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GCN1-LABEL: atomic_xchg_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3834,8 +3834,8 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN2-LABEL: atomic_xchg_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3849,12 +3849,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN3-LABEL: atomic_xchg_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3868,53 +3868,53 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -3927,18 +3927,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3946,18 +3946,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3965,13 +3965,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN3-LABEL: atomic_xchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -3989,65 +3989,65 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4061,8 +4061,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4074,8 +4074,8 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4087,12 +4087,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4105,49 +4105,49 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4159,16 +4159,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4176,16 +4176,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4193,13 +4193,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -4216,61 +4216,61 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4285,7 +4285,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN2-LABEL: atomic_cmpxchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4334,56 +4334,56 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 16 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 16 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4397,19 +4397,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4418,19 +4418,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4439,15 +4439,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s7, s[4:5], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -4465,71 +4465,71 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s9, s[4:5], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s9, s[4:5], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s9, s[4:5], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4544,7 +4544,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -4557,7 +4557,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 @@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4588,52 +4588,52 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4646,17 +4646,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4665,17 +4665,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4684,15 +4684,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s7, s[4:5], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 @@ -4709,67 +4709,67 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s9, s[4:5], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s9, s[4:5], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s9, s[4:5], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4783,8 +4783,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4798,8 +4798,8 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4813,12 +4813,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4832,53 +4832,53 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4891,18 +4891,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4910,18 +4910,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4929,13 +4929,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_xor_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -4953,65 +4953,65 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5025,8 +5025,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5038,8 +5038,8 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5051,12 +5051,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5069,49 +5069,49 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5123,16 +5123,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5140,16 +5140,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5157,13 +5157,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xor_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -5180,61 +5180,61 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5247,7 +5247,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5279,15 +5279,15 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5300,7 +5300,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5314,7 +5314,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5328,15 +5328,15 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5348,10 +5348,10 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 @@ -5368,10 +5368,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 @@ -5388,19 +5388,19 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5414,10 +5414,10 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -5432,10 +5432,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -5450,19 +5450,19 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5475,8 +5475,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5488,8 +5488,8 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5501,12 +5501,12 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5518,8 +5518,8 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5529,8 +5529,8 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5540,12 +5540,12 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5556,47 +5556,47 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s2 +; GCN1-NEXT: s_addc_u32 s1, s1, s3 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s2 +; GCN2-NEXT: s_addc_u32 s1, s1, s3 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s2 +; GCN3-NEXT: s_addc_u32 s1, s1, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5609,43 +5609,43 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s2 +; GCN1-NEXT: s_addc_u32 s1, s1, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s2 +; GCN2-NEXT: s_addc_u32 s1, s1, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s2 +; GCN3-NEXT: s_addc_u32 s1, s1, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5657,7 +5657,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5673,7 +5673,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5689,15 +5689,15 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5710,7 +5710,7 @@ entry: define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5724,7 +5724,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5738,15 +5738,15 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5758,10 +5758,10 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 @@ -5778,10 +5778,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 @@ -5798,19 +5798,19 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5824,10 +5824,10 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -5842,10 +5842,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -5860,19 +5860,19 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5885,8 +5885,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5898,8 +5898,8 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5911,12 +5911,12 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5928,8 +5928,8 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5939,8 +5939,8 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5950,12 +5950,12 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5966,47 +5966,47 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s2 +; GCN1-NEXT: s_addc_u32 s1, s1, s3 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s2 +; GCN2-NEXT: s_addc_u32 s1, s1, s3 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s2 +; GCN3-NEXT: s_addc_u32 s1, s1, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6019,43 +6019,43 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s2 +; GCN1-NEXT: s_addc_u32 s1, s1, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s2 +; GCN2-NEXT: s_addc_u32 s1, s1, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s2 +; GCN3-NEXT: s_addc_u32 s1, s1, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6067,7 +6067,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6083,7 +6083,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6099,15 +6099,15 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6120,7 +6120,7 @@ entry: define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6134,7 +6134,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6148,15 +6148,15 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6168,11 +6168,11 @@ entry: define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -6180,18 +6180,18 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_load_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -6199,25 +6199,25 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_load_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s0, s6 +; GCN3-NEXT: s_addc_u32 s1, s1, s7 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6231,8 +6231,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6244,8 +6244,8 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6257,12 +6257,12 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6274,8 +6274,8 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6285,8 +6285,8 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6296,12 +6296,12 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6312,44 +6312,44 @@ entry: define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, s6 -; GCN1-NEXT: s_addc_u32 s1, s5, s7 +; GCN1-NEXT: s_add_u32 s0, s0, s2 +; GCN1-NEXT: s_addc_u32 s1, s1, s3 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, s6 -; GCN2-NEXT: s_addc_u32 s1, s5, s7 +; GCN2-NEXT: s_add_u32 s0, s0, s2 +; GCN2-NEXT: s_addc_u32 s1, s1, s3 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s0, s4, s6 -; GCN3-NEXT: s_addc_u32 s1, s5, s7 +; GCN3-NEXT: s_add_u32 s0, s0, s2 +; GCN3-NEXT: s_addc_u32 s1, s1, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6362,7 +6362,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6394,15 +6394,15 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6415,7 +6415,7 @@ entry: define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6429,7 +6429,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6443,15 +6443,15 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6463,10 +6463,10 @@ entry: define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 @@ -6483,10 +6483,10 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 @@ -6503,19 +6503,19 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6529,8 +6529,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6542,8 +6542,8 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6555,12 +6555,12 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6572,8 +6572,8 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6583,8 +6583,8 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6594,12 +6594,12 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6610,47 +6610,47 @@ entry: define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GCN1-NEXT: s_add_u32 s0, s0, s2 +; GCN1-NEXT: s_addc_u32 s1, s1, s3 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GCN2-NEXT: s_add_u32 s0, s0, s2 +; GCN2-NEXT: s_addc_u32 s1, s1, s3 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GCN3-NEXT: s_add_u32 s0, s0, s2 +; GCN3-NEXT: s_addc_u32 s1, s1, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6663,8 +6663,8 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6676,8 +6676,8 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6689,12 +6689,12 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6706,8 +6706,8 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6717,8 +6717,8 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6728,12 +6728,12 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_f16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6744,8 +6744,8 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6755,8 +6755,8 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6766,12 +6766,12 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %gep = getelementptr bfloat, ptr %out, i64 8 @@ -6782,8 +6782,8 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6793,8 +6793,8 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6804,12 +6804,12 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; ; GCN3-LABEL: atomic_store_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm store atomic bfloat %in, ptr %out seq_cst, align 2 @@ -6819,8 +6819,8 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6834,8 +6834,8 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6849,12 +6849,12 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6868,8 +6868,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 0xffc ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6883,8 +6883,8 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 0xffc ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6898,12 +6898,12 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6917,8 +6917,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6932,8 +6932,8 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6947,14 +6947,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6968,53 +6968,53 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_inc_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7027,18 +7027,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7046,18 +7046,18 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i32_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7065,13 +7065,13 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_inc_i32_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -7089,65 +7089,65 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7161,8 +7161,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -7174,8 +7174,8 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -7187,12 +7187,12 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7205,49 +7205,49 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_inc_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7259,16 +7259,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7276,16 +7276,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i32_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7293,13 +7293,13 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_inc_i32_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -7316,61 +7316,61 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7383,8 +7383,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7398,8 +7398,8 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7413,12 +7413,12 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7432,8 +7432,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 0xffc ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7447,8 +7447,8 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 0xffc ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7462,12 +7462,12 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7481,8 +7481,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7496,8 +7496,8 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7511,14 +7511,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7532,53 +7532,53 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 16 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s0, 16 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 16 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s0, 16 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_dec_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7591,18 +7591,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7610,18 +7610,18 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i32_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7629,13 +7629,13 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_dec_i32_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -7653,65 +7653,65 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7725,8 +7725,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -7738,8 +7738,8 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -7751,12 +7751,12 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7769,49 +7769,49 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_dec_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7823,16 +7823,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7840,16 +7840,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i32_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7857,13 +7857,13 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_dec_i32_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s2, s0 +; GCN3-NEXT: s_addc_u32 s1, s3, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -7880,61 +7880,61 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s8, s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s8 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -7947,7 +7947,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7963,7 +7963,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7979,15 +7979,15 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %gep = getelementptr half, ptr %in, i64 8 @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8013,7 +8013,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8027,15 +8027,15 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %val = load atomic half, ptr %in seq_cst, align 2 @@ -8046,7 +8046,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -8062,7 +8062,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -8078,15 +8078,15 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %gep = getelementptr bfloat, ptr %in, i64 8 @@ -8098,7 +8098,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8112,7 +8112,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8126,15 +8126,15 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm %val = load atomic bfloat, ptr %in seq_cst, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 9c2faf622623d6..9b11929de2c910 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3823,7 +3823,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3918,14 +3918,14 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s7, s5, 31 -; GCN1-NEXT: s_mov_b32 s6, s5 -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_ashr_i32 s5, s7, 31 +; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -3936,7 +3936,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_i32_e32 v2, s4, v3 +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3953,14 +3953,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s7, s5, 31 -; GCN2-NEXT: s_mov_b32 s6, s5 -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_ashr_i32 s5, s7, 31 +; GCN2-NEXT: s_mov_b32 s4, s7 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -3971,7 +3971,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s4, v3 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3988,34 +3988,34 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s5, s7, 31 +; GCN3-NEXT: s_mov_b32 s4, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4029,7 +4029,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4057,7 +4057,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] @@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4119,14 +4119,14 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s7, s5, 31 -; GCN1-NEXT: s_mov_b32 s6, s5 -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_ashr_i32 s5, s7, 31 +; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] @@ -4135,7 +4135,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_i32_e32 v2, s4, v3 +; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4152,14 +4152,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s7, s5, 31 -; GCN2-NEXT: s_mov_b32 s6, s5 -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_ashr_i32 s5, s7, 31 +; GCN2-NEXT: s_mov_b32 s4, s7 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] @@ -4168,7 +4168,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s4, v3 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4185,34 +4185,34 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s5, s7, 31 +; GCN3-NEXT: s_mov_b32 s4, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -4966,7 +4966,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4996,7 +4996,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5061,14 +5061,14 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s7, s5, 31 -; GCN1-NEXT: s_mov_b32 s6, s5 -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_ashr_i32 s5, s7, 31 +; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -5079,7 +5079,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s4, v3 +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5096,14 +5096,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s7, s5, 31 -; GCN2-NEXT: s_mov_b32 s6, s5 -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_ashr_i32 s5, s7, 31 +; GCN2-NEXT: s_mov_b32 s4, s7 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -5114,7 +5114,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s4, v3 +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5131,34 +5131,34 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s5, s7, 31 +; GCN3-NEXT: s_mov_b32 s4, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -5172,14 +5172,14 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s7, s5, 31 -; GCN1-NEXT: s_mov_b32 s6, s5 -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_ashr_i32 s5, s7, 31 +; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] @@ -5188,7 +5188,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s4, v3 +; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5205,14 +5205,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s7, s5, 31 -; GCN2-NEXT: s_mov_b32 s6, s5 -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_ashr_i32 s5, s7, 31 +; GCN2-NEXT: s_mov_b32 s4, s7 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] @@ -5221,7 +5221,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s4, v3 +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5238,34 +5238,34 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s5, s7, 31 +; GCN3-NEXT: s_mov_b32 s4, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6760,7 +6760,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -6790,7 +6790,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s1, s7, 31 -; GCN3-NEXT: s_mov_b32 s0, s7 -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_ashr_i32 s5, s3, 31 +; GCN3-NEXT: s_mov_b32 s4, s3 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6855,14 +6855,14 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s7, s5, 31 -; GCN1-NEXT: s_mov_b32 s6, s5 -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_ashr_i32 s5, s7, 31 +; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -6873,7 +6873,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6890,14 +6890,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s7, s5, 31 -; GCN2-NEXT: s_mov_b32 s6, s5 -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_ashr_i32 s5, s7, 31 +; GCN2-NEXT: s_mov_b32 s4, s7 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -6908,7 +6908,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6925,34 +6925,34 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s5, s7, 31 +; GCN3-NEXT: s_mov_b32 s4, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6966,12 +6966,12 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6990,12 +6990,12 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7014,8 +7014,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -7024,7 +7024,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7043,14 +7043,14 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_ashr_i32 s7, s5, 31 -; GCN1-NEXT: s_mov_b32 s6, s5 -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_ashr_i32 s5, s7, 31 +; GCN1-NEXT: s_mov_b32 s4, s7 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] @@ -7059,7 +7059,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7076,14 +7076,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s7, s5, 31 -; GCN2-NEXT: s_mov_b32 s6, s5 -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_ashr_i32 s5, s7, 31 +; GCN2-NEXT: s_mov_b32 s4, s7 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] @@ -7092,7 +7092,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7109,34 +7109,34 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s5, s7, 31 +; GCN3-NEXT: s_mov_b32 s4, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index f2959dc19ba4c0..f5433ca4da4cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -9,37 +9,37 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB0_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB0_4 ; GCN1-NEXT: .LBB0_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB0_2 ; GCN1-NEXT: .LBB0_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -47,7 +47,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -59,29 +59,29 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB0_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB0_4 ; GCN2-NEXT: .LBB0_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -94,9 +94,9 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -105,7 +105,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -150,23 +150,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB1_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -175,24 +175,24 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB1_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB1_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s1 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB1_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -202,23 +202,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB1_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -227,42 +227,42 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB1_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB1_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB1_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB1_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -272,15 +272,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB1_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB1_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -296,41 +296,41 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB2_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB2_4 ; GCN1-NEXT: .LBB2_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB2_2 ; GCN1-NEXT: .LBB2_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -338,7 +338,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -350,33 +350,33 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB2_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB2_4 ; GCN2-NEXT: .LBB2_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -389,9 +389,9 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -401,10 +401,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -447,18 +447,18 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -467,9 +467,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB3_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -479,23 +479,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB3_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB3_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -505,14 +505,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -521,9 +521,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB3_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -539,22 +539,22 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB3_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -602,44 +602,44 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB4_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB4_4 ; GCN1-NEXT: .LBB4_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB4_2 ; GCN1-NEXT: .LBB4_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -650,43 +650,43 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB4_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB4_4 ; GCN2-NEXT: .LBB4_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB4_2 ; GCN2-NEXT: .LBB4_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -695,7 +695,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -736,22 +736,22 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB5_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -760,24 +760,24 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB5_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB5_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s1 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB5_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -786,22 +786,22 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB5_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -810,41 +810,41 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB5_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB5_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB5_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB5_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -853,15 +853,15 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB5_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB5_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB5_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -876,39 +876,39 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB6_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB6_4 ; GCN1-NEXT: .LBB6_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB6_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB6_2 ; GCN1-NEXT: .LBB6_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -916,7 +916,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -928,31 +928,31 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB6_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB6_4 ; GCN2-NEXT: .LBB6_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB6_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -965,9 +965,9 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -977,16 +977,15 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB6_3 @@ -1021,27 +1020,27 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB7_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1051,23 +1050,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB7_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -1077,23 +1076,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB7_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1109,22 +1108,22 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -1171,36 +1170,36 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB8_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB8_4 ; GCN1-NEXT: .LBB8_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB8_2 ; GCN1-NEXT: .LBB8_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -1208,9 +1207,9 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -1220,29 +1219,29 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB8_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB8_4 ; GCN2-NEXT: .LBB8_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1256,16 +1255,16 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -1310,23 +1309,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB9_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1335,23 +1334,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB9_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB9_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN1-NEXT: v_and_b32_e32 v4, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN1-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB9_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -1361,23 +1360,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB9_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1386,41 +1385,41 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB9_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB9_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN2-NEXT: v_and_b32_e32 v4, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN2-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB9_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB9_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1430,15 +1429,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB9_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v3, s1, v1 -; GFX12-NEXT: v_and_b32_e32 v2, s0, v0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB9_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1454,40 +1453,40 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB10_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB10_4 ; GCN1-NEXT: .LBB10_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB10_2 ; GCN1-NEXT: .LBB10_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -1495,9 +1494,9 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -1507,33 +1506,33 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB10_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB10_4 ; GCN2-NEXT: .LBB10_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1547,9 +1546,9 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -1557,10 +1556,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -1603,18 +1602,18 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -1623,9 +1622,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB11_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1640,17 +1639,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN1-NEXT: v_and_b32_e32 v4, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v5, s9, v1 -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: v_and_b32_e32 v5, s13, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB11_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -1660,14 +1659,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -1676,9 +1675,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB11_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1695,20 +1694,20 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN2-NEXT: v_and_b32_e32 v4, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN2-NEXT: v_and_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB11_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -1756,45 +1755,45 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB12_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB12_4 ; GCN1-NEXT: .LBB12_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB12_2 ; GCN1-NEXT: .LBB12_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -1803,51 +1802,51 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB12_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB12_4 ; GCN2-NEXT: .LBB12_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB12_2 ; GCN2-NEXT: .LBB12_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -1888,22 +1887,22 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB13_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1912,23 +1911,23 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB13_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB13_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN1-NEXT: v_and_b32_e32 v4, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN1-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB13_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -1937,22 +1936,22 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB13_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1961,40 +1960,40 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB13_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB13_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN2-NEXT: v_and_b32_e32 v4, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN2-NEXT: v_and_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB13_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB13_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2003,15 +2002,15 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB13_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB13_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v3, s1, v1 -; GFX12-NEXT: v_and_b32_e32 v2, s0, v0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB13_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -2026,38 +2025,38 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB14_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB14_4 ; GCN1-NEXT: .LBB14_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB14_2 ; GCN1-NEXT: .LBB14_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -2065,9 +2064,9 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -2077,31 +2076,31 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB14_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB14_4 ; GCN2-NEXT: .LBB14_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -2115,9 +2114,9 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_and_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -2125,16 +2124,15 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB14_3 @@ -2169,27 +2167,27 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB15_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -2204,17 +2202,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN1-NEXT: v_and_b32_e32 v4, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v5, s9, v1 -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: v_and_b32_e32 v5, s13, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB15_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -2224,23 +2222,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB15_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -2257,20 +2255,20 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN2-NEXT: v_and_b32_e32 v4, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN2-NEXT: v_and_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB15_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -2317,37 +2315,37 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB16_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB16_4 ; GCN1-NEXT: .LBB16_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB16_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB16_2 ; GCN1-NEXT: .LBB16_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -2355,7 +2353,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -2367,29 +2365,29 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB16_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB16_4 ; GCN2-NEXT: .LBB16_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB16_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -2402,9 +2400,9 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -2413,7 +2411,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -2458,23 +2456,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB17_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -2483,24 +2481,24 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB17_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB17_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s1 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -2510,23 +2508,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB17_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -2535,42 +2533,42 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB17_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB17_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB17_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2580,15 +2578,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB17_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 -; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB17_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -2604,41 +2602,41 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB18_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB18_4 ; GCN1-NEXT: .LBB18_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB18_2 ; GCN1-NEXT: .LBB18_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -2646,7 +2644,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -2658,33 +2656,33 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB18_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB18_4 ; GCN2-NEXT: .LBB18_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -2697,9 +2695,9 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -2709,10 +2707,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -2755,18 +2753,18 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -2775,9 +2773,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB19_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -2787,23 +2785,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB19_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB19_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -2813,14 +2811,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -2829,9 +2827,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB19_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -2847,22 +2845,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB19_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -2910,44 +2908,44 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB20_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB20_4 ; GCN1-NEXT: .LBB20_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB20_2 ; GCN1-NEXT: .LBB20_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -2958,43 +2956,43 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB20_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB20_4 ; GCN2-NEXT: .LBB20_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB20_2 ; GCN2-NEXT: .LBB20_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -3003,7 +3001,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -3044,22 +3042,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB21_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3068,24 +3066,24 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB21_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB21_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s1 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB21_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -3094,22 +3092,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB21_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3118,41 +3116,41 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB21_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB21_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB21_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB21_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3161,15 +3159,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB21_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 -; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3184,39 +3182,39 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB22_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB22_4 ; GCN1-NEXT: .LBB22_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB22_2 ; GCN1-NEXT: .LBB22_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -3224,7 +3222,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen @@ -3236,31 +3234,31 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB22_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB22_4 ; GCN2-NEXT: .LBB22_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3273,9 +3271,9 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s2, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen @@ -3285,16 +3283,15 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB22_3 @@ -3329,27 +3326,27 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB23_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3359,23 +3356,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB23_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB23_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -3385,23 +3382,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB23_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3417,22 +3414,22 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB23_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -3479,45 +3476,45 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB24_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB24_4 ; GCN1-NEXT: .LBB24_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB24_2 ; GCN1-NEXT: .LBB24_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -3529,29 +3526,29 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB24_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB24_4 ; GCN2-NEXT: .LBB24_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB24_2 @@ -3563,10 +3560,10 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -3575,7 +3572,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -3621,23 +3618,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB25_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB25_3 @@ -3645,25 +3642,25 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB25_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB25_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB25_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -3674,23 +3671,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB25_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB25_3 @@ -3698,24 +3695,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB25_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB25_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB25_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -3723,19 +3720,19 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB25_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3745,16 +3742,16 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB25_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB25_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3770,49 +3767,49 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB26_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB26_4 ; GCN1-NEXT: .LBB26_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB26_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB26_2 ; GCN1-NEXT: .LBB26_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -3824,33 +3821,33 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB26_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB26_4 ; GCN2-NEXT: .LBB26_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB26_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB26_2 @@ -3862,10 +3859,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -3875,10 +3872,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3922,18 +3919,18 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -3942,9 +3939,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB27_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB27_3 @@ -3953,24 +3950,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB27_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -3981,14 +3978,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -3997,9 +3994,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB27_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB27_3 @@ -4014,24 +4011,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -4080,44 +4077,44 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB28_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB28_4 ; GCN1-NEXT: .LBB28_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB28_2 ; GCN1-NEXT: .LBB28_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -4128,43 +4125,43 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB28_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB28_4 ; GCN2-NEXT: .LBB28_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB28_2 ; GCN2-NEXT: .LBB28_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -4173,7 +4170,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -4215,22 +4212,22 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB29_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB29_3 @@ -4238,25 +4235,25 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB29_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB29_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB29_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -4266,22 +4263,22 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB29_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB29_3 @@ -4289,24 +4286,24 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB29_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB29_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB29_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -4314,18 +4311,18 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB29_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -4334,16 +4331,16 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB29_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB29_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB29_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -4358,47 +4355,47 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB30_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB30_4 ; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB30_2 ; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -4410,31 +4407,31 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB30_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB30_4 ; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB30_2 @@ -4446,10 +4443,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -4459,16 +4456,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB30_3 @@ -4504,27 +4500,27 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB31_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB31_3 @@ -4533,24 +4529,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB31_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB31_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -4561,23 +4557,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB31_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB31_3 @@ -4592,24 +4588,24 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB31_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -4657,45 +4653,45 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB32_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB32_4 ; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB32_2 ; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -4707,29 +4703,29 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB32_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB32_4 ; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB32_2 @@ -4741,10 +4737,10 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -4753,7 +4749,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -4799,23 +4795,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB33_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB33_3 @@ -4823,25 +4819,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: .LBB33_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB33_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB33_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -4852,23 +4848,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB33_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB33_3 @@ -4876,24 +4872,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: .LBB33_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB33_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB33_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -4901,19 +4897,19 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB33_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -4923,16 +4919,16 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB33_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB33_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -4948,49 +4944,49 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB34_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB34_4 ; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB34_2 ; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -5002,33 +4998,33 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB34_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB34_4 ; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB34_2 @@ -5040,10 +5036,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -5053,10 +5049,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -5100,18 +5096,18 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -5120,9 +5116,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB35_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB35_3 @@ -5131,24 +5127,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB35_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB35_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -5159,14 +5155,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -5175,9 +5171,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB35_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB35_3 @@ -5192,24 +5188,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB35_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -5258,44 +5254,44 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB36_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB36_4 ; GCN1-NEXT: .LBB36_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB36_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB36_2 ; GCN1-NEXT: .LBB36_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -5306,43 +5302,43 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB36_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB36_4 ; GCN2-NEXT: .LBB36_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB36_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB36_2 ; GCN2-NEXT: .LBB36_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -5351,7 +5347,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -5393,22 +5389,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB37_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB37_3 @@ -5416,25 +5412,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB37_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -5444,22 +5440,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB37_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB37_3 @@ -5467,24 +5463,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB37_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -5492,18 +5488,18 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB37_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -5512,16 +5508,16 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB37_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB37_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB37_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -5536,47 +5532,47 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB38_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB38_4 ; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB38_2 ; GCN1-NEXT: .LBB38_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -5588,31 +5584,31 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB38_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB38_4 ; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB38_2 @@ -5624,10 +5620,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -5637,16 +5633,15 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB38_3 @@ -5682,27 +5677,27 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB39_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB39_3 @@ -5711,24 +5706,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB39_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB39_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -5739,23 +5734,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB39_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB39_3 @@ -5770,24 +5765,24 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB39_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5835,45 +5830,45 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB40_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB40_4 ; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB40_2 ; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -5885,29 +5880,29 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB40_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB40_4 ; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB40_2 @@ -5919,10 +5914,10 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -5931,7 +5926,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -5977,23 +5972,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB41_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB41_3 @@ -6001,25 +5996,25 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB41_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB41_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB41_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -6030,23 +6025,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB41_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB41_3 @@ -6054,24 +6049,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB41_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB41_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB41_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -6079,19 +6074,19 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB41_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -6101,16 +6096,16 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB41_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB41_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -6126,49 +6121,49 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB42_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB42_4 ; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB42_2 ; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -6180,33 +6175,33 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB42_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB42_4 ; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB42_2 @@ -6218,10 +6213,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -6231,10 +6226,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -6278,18 +6273,18 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -6298,9 +6293,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB43_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB43_3 @@ -6309,24 +6304,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB43_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB43_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -6337,14 +6332,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -6353,9 +6348,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB43_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB43_3 @@ -6370,24 +6365,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB43_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -6436,44 +6431,44 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB44_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB44_4 ; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB44_2 ; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -6484,43 +6479,43 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB44_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB44_4 ; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB44_2 ; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -6529,7 +6524,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -6571,22 +6566,22 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB45_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB45_3 @@ -6594,25 +6589,25 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB45_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB45_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB45_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -6622,22 +6617,22 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB45_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB45_3 @@ -6645,24 +6640,24 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB45_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB45_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB45_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -6670,18 +6665,18 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB45_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -6690,16 +6685,16 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB45_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB45_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB45_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -6714,47 +6709,47 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB46_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB46_4 ; GCN1-NEXT: .LBB46_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB46_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB46_2 ; GCN1-NEXT: .LBB46_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -6766,31 +6761,31 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB46_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB46_4 ; GCN2-NEXT: .LBB46_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB46_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB46_2 @@ -6802,10 +6797,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -6815,16 +6810,15 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB46_3 @@ -6860,27 +6854,27 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB47_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB47_3 @@ -6889,24 +6883,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -6917,23 +6911,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB47_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB47_3 @@ -6948,24 +6942,24 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -7013,45 +7007,45 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB48_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB48_4 ; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB48_2 ; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -7063,29 +7057,29 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB48_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB48_4 ; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB48_2 @@ -7097,10 +7091,10 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -7109,7 +7103,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -7155,23 +7149,23 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB49_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB49_3 @@ -7179,25 +7173,25 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: .LBB49_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB49_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB49_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -7208,23 +7202,23 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB49_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB49_3 @@ -7232,24 +7226,24 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: .LBB49_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB49_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB49_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -7257,19 +7251,19 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB49_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7279,16 +7273,16 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB49_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB49_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -7304,49 +7298,49 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB50_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB50_4 ; GCN1-NEXT: .LBB50_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB50_2 ; GCN1-NEXT: .LBB50_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -7358,33 +7352,33 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB50_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB50_4 ; GCN2-NEXT: .LBB50_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB50_2 @@ -7396,10 +7390,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -7409,10 +7403,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -7456,18 +7450,18 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -7476,9 +7470,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB51_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB51_3 @@ -7487,24 +7481,24 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB51_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB51_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -7515,14 +7509,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -7531,9 +7525,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB51_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB51_3 @@ -7548,24 +7542,24 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB51_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -7614,44 +7608,44 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB52_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB52_4 ; GCN1-NEXT: .LBB52_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB52_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB52_2 ; GCN1-NEXT: .LBB52_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -7662,43 +7656,43 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB52_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB52_4 ; GCN2-NEXT: .LBB52_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB52_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB52_2 ; GCN2-NEXT: .LBB52_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -7707,7 +7701,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -7749,22 +7743,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB53_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB53_3 @@ -7772,25 +7766,25 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB53_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB53_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB53_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -7800,22 +7794,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB53_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB53_3 @@ -7823,24 +7817,24 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB53_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB53_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s0 -; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB53_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -7848,18 +7842,18 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB53_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7868,16 +7862,16 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB53_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB53_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB53_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -7892,47 +7886,47 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB54_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB54_4 ; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execnz .LBB54_2 ; GCN1-NEXT: .LBB54_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -7944,31 +7938,31 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB54_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB54_4 ; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execnz .LBB54_2 @@ -7980,10 +7974,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -7993,16 +7987,15 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB54_3 @@ -8034,31 +8027,31 @@ entry: %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst ret void } - -define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { -; GCN1-LABEL: atomic_umin_i64_ret_addr64: -; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 + +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_umin_i64_ret_addr64: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB55_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_cbranch_execz .LBB55_3 @@ -8067,24 +8060,24 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB55_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB55_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -8095,23 +8088,23 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB55_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_cbranch_execz .LBB55_3 @@ -8126,24 +8119,24 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB55_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8191,36 +8184,36 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB56_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB56_4 ; GCN1-NEXT: .LBB56_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB56_2 ; GCN1-NEXT: .LBB56_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -8228,9 +8221,9 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -8240,29 +8233,29 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB56_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB56_4 ; GCN2-NEXT: .LBB56_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -8276,16 +8269,16 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -8330,23 +8323,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB57_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -8355,23 +8348,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: .LBB57_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB57_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN1-NEXT: v_or_b32_e32 v4, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN1-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB57_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -8381,23 +8374,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB57_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -8406,41 +8399,41 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: .LBB57_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB57_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN2-NEXT: v_or_b32_e32 v4, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN2-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB57_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB57_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -8450,15 +8443,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB57_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_or_b32_e32 v3, s1, v1 -; GFX12-NEXT: v_or_b32_e32 v2, s0, v0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB57_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -8474,40 +8467,40 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB58_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB58_4 ; GCN1-NEXT: .LBB58_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB58_2 ; GCN1-NEXT: .LBB58_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -8515,9 +8508,9 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -8527,33 +8520,33 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB58_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB58_4 ; GCN2-NEXT: .LBB58_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -8567,9 +8560,9 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -8577,10 +8570,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -8623,18 +8616,18 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -8643,9 +8636,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB59_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -8660,17 +8653,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN1-NEXT: v_or_b32_e32 v4, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v5, s9, v1 -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: v_or_b32_e32 v5, s13, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB59_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -8680,14 +8673,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -8696,9 +8689,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB59_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -8715,20 +8708,20 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN2-NEXT: v_or_b32_e32 v4, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v5, s9, v1 +; GCN2-NEXT: v_or_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB59_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -8776,45 +8769,45 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB60_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB60_4 ; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB60_2 ; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -8823,51 +8816,51 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB60_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB60_4 ; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB60_2 ; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -8908,22 +8901,22 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB61_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -8932,23 +8925,23 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB61_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB61_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN1-NEXT: v_or_b32_e32 v4, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN1-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB61_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -8957,22 +8950,22 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB61_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -8981,40 +8974,40 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB61_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB61_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN2-NEXT: v_or_b32_e32 v4, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN2-NEXT: v_or_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB61_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB61_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -9023,15 +9016,15 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB61_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB61_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_or_b32_e32 v3, s1, v1 -; GFX12-NEXT: v_or_b32_e32 v2, s0, v0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB61_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -9046,38 +9039,38 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB62_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB62_4 ; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB62_2 ; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -9085,9 +9078,9 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -9097,31 +9090,31 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB62_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB62_4 ; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9135,9 +9128,9 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_or_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -9145,16 +9138,15 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB62_3 @@ -9189,27 +9181,27 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB63_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -9224,17 +9216,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN1-NEXT: v_or_b32_e32 v4, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v5, s9, v1 -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: v_or_b32_e32 v5, s13, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB63_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -9244,23 +9236,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB63_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9277,20 +9269,20 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN2-NEXT: v_or_b32_e32 v4, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v5, s9, v1 +; GCN2-NEXT: v_or_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB63_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -9337,42 +9329,42 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB64_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB64_4 ; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB64_2 ; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -9382,29 +9374,29 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB64_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB64_4 ; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9412,18 +9404,18 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -9464,42 +9456,42 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB65_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB65_4 ; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB65_2 ; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -9509,29 +9501,29 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB65_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB65_4 ; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9539,18 +9531,18 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -9591,42 +9583,42 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB66_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB66_4 ; GCN1-NEXT: .LBB66_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB66_2 ; GCN1-NEXT: .LBB66_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -9636,29 +9628,29 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB66_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB66_4 ; GCN2-NEXT: .LBB66_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9666,18 +9658,18 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN2-NEXT: .LBB66_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -9719,23 +9711,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB67_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -9744,21 +9736,21 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: .LBB67_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -9769,23 +9761,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB67_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9794,20 +9786,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: .LBB67_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v2, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -9815,19 +9807,19 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB67_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -9837,13 +9829,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB67_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB67_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -9860,46 +9852,46 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB68_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB68_4 ; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB68_2 ; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -9909,33 +9901,33 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB68_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB68_4 ; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -9943,11 +9935,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -9955,10 +9947,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -9998,18 +9990,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -10018,9 +10010,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB69_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -10030,20 +10022,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB69_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s8 +; GCN1-NEXT: v_mov_b32_e32 v4, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v2, s9 -; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s13 +; GCN1-NEXT: buffer_store_dword v2, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB69_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -10054,14 +10046,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -10070,9 +10062,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB69_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -10088,20 +10080,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s12 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v2, s9 +; GCN2-NEXT: v_mov_b32_e32 v2, s13 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB69_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -10148,41 +10140,41 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB70_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB70_4 ; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB70_2 ; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -10191,47 +10183,47 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB70_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB70_4 ; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB70_2 ; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -10269,22 +10261,22 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB71_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -10293,21 +10285,21 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB71_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB71_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -10317,22 +10309,22 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB71_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -10341,20 +10333,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB71_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB71_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v2, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -10362,18 +10354,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB71_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -10382,13 +10374,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB71_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB71_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB71_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -10404,44 +10396,44 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB72_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB72_4 ; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB72_2 ; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s3 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -10451,31 +10443,31 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB72_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB72_4 ; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -10483,11 +10475,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s3 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -10495,16 +10487,15 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB72_3 @@ -10536,27 +10527,27 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB73_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -10566,20 +10557,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB73_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s8 +; GCN1-NEXT: v_mov_b32_e32 v4, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v2, s9 -; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s13 +; GCN1-NEXT: buffer_store_dword v2, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB73_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm @@ -10590,23 +10581,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB73_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -10622,20 +10613,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s12 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v2, s9 +; GCN2-NEXT: v_mov_b32_e32 v2, s13 ; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB73_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -10681,36 +10672,36 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB74_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB74_4 ; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB74_2 ; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -10718,9 +10709,9 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -10730,29 +10721,29 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB74_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB74_4 ; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -10766,16 +10757,16 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -10820,23 +10811,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB75_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -10845,23 +10836,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB75_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB75_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN1-NEXT: v_xor_b32_e32 v4, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN1-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB75_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -10871,23 +10862,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB75_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -10896,41 +10887,41 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB75_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB75_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN2-NEXT: v_xor_b32_e32 v4, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN2-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB75_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB75_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -10940,15 +10931,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB75_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_xor_b32_e32 v3, s1, v1 -; GFX12-NEXT: v_xor_b32_e32 v2, s0, v0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB75_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -10964,40 +10955,40 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB76_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB76_4 ; GCN1-NEXT: .LBB76_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB76_2 ; GCN1-NEXT: .LBB76_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -11005,9 +10996,9 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -11017,33 +11008,33 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB76_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB76_4 ; GCN2-NEXT: .LBB76_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11057,9 +11048,9 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -11067,10 +11058,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -11113,18 +11104,18 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -11133,9 +11124,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB77_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11150,17 +11141,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN1-NEXT: v_xor_b32_e32 v4, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1 -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: v_xor_b32_e32 v5, s13, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -11170,14 +11161,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -11186,9 +11177,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB77_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11205,20 +11196,20 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN2-NEXT: v_xor_b32_e32 v4, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN2-NEXT: v_xor_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -11266,45 +11257,45 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB78_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB78_4 ; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB78_2 ; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v1, s0 ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -11313,51 +11304,51 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB78_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB78_4 ; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB78_2 ; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v1, s0 ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -11398,22 +11389,22 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB79_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11422,23 +11413,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB79_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB79_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN1-NEXT: v_xor_b32_e32 v4, s4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN1-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB79_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -11447,22 +11438,22 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB79_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11471,40 +11462,40 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB79_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB79_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN2-NEXT: v_xor_b32_e32 v4, s4, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN2-NEXT: v_xor_b32_e32 v5, s5, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB79_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB79_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -11513,15 +11504,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB79_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB79_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_xor_b32_e32 v3, s1, v1 -; GFX12-NEXT: v_xor_b32_e32 v2, s0, v0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB79_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -11536,38 +11527,38 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB80_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB80_4 ; GCN1-NEXT: .LBB80_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB80_2 ; GCN1-NEXT: .LBB80_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -11575,9 +11566,9 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm @@ -11587,31 +11578,31 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB80_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB80_4 ; GCN2-NEXT: .LBB80_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11625,9 +11616,9 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: v_xor_b32_e32 v2, s2, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, s3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm @@ -11635,16 +11626,15 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB80_3 @@ -11679,27 +11669,27 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB81_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11714,17 +11704,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN1-NEXT: v_xor_b32_e32 v4, s12, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1 -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: v_xor_b32_e32 v5, s13, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -11734,23 +11724,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB81_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11767,20 +11757,20 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN2-NEXT: v_xor_b32_e32 v4, s12, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN2-NEXT: v_xor_b32_e32 v5, s13, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -11824,7 +11814,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -11840,7 +11830,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -11856,7 +11846,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -11875,7 +11865,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -11889,7 +11879,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -11903,7 +11893,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -11921,10 +11911,10 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -11941,10 +11931,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -11962,10 +11952,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11986,10 +11976,10 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -12004,10 +11994,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -12023,10 +12013,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12046,7 +12036,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -12059,7 +12049,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -12072,7 +12062,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -12088,7 +12078,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -12099,7 +12089,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -12110,7 +12100,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -12125,34 +12115,34 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s6, s0 -; GCN1-NEXT: s_addc_u32 s1, s7, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s6, s0 -; GCN2-NEXT: s_addc_u32 s1, s7, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -12160,10 +12150,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12181,30 +12171,30 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s6, s0 -; GCN1-NEXT: s_addc_u32 s1, s7, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s6, s0 -; GCN2-NEXT: s_addc_u32 s1, s7, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -12212,10 +12202,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12236,48 +12226,48 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN1-NEXT: s_mov_b64 s[6:7], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB90_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB90_4 ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s3 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB90_2 ; GCN1-NEXT: .LBB90_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -12290,47 +12280,47 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN2-NEXT: s_mov_b64 s[6:7], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB90_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB90_4 ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s3 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB90_2 ; GCN2-NEXT: .LBB90_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -12340,40 +12330,40 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB90_3 ; GFX12-NEXT: ; %bb.1: ; %Flow -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB90_4 ; GFX12-NEXT: .LBB90_2: ; %atomicrmw.phi ; GFX12-NEXT: s_endpgm ; GFX12-NEXT: .LBB90_3: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB90_2 ; GFX12-NEXT: .LBB90_4: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -12388,48 +12378,48 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 0x11940 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 0x11940 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN1-NEXT: s_mov_b64 s[6:7], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB91_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB91_4 ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB91_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s3 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB91_2 ; GCN1-NEXT: .LBB91_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -12442,47 +12432,47 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 0x11940 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 0x11940 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN2-NEXT: s_mov_b64 s[6:7], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB91_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB91_4 ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB91_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s3 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB91_2 ; GCN2-NEXT: .LBB91_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -12492,40 +12482,40 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 0x11940 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x11940 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB91_3 ; GFX12-NEXT: ; %bb.1: ; %Flow -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB91_4 ; GFX12-NEXT: .LBB91_2: ; %atomicrmw.phi ; GFX12-NEXT: s_endpgm ; GFX12-NEXT: .LBB91_3: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB91_2 ; GFX12-NEXT: .LBB91_4: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 9000 @@ -12536,27 +12526,27 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_add_u32 s0, s8, 32 +; GCN1-NEXT: s_addc_u32 s1, s9, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB92_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: v_mov_b32_e32 v0, s12 +; GCN1-NEXT: v_mov_b32_e32 v1, s13 +; GCN1-NEXT: v_mov_b32_e32 v2, s14 +; GCN1-NEXT: v_mov_b32_e32 v3, s15 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12567,24 +12557,24 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB92_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB92_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -12594,23 +12584,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_add_u32 s0, s8, 32 +; GCN2-NEXT: s_addc_u32 s1, s9, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB92_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: v_mov_b32_e32 v0, s12 +; GCN2-NEXT: v_mov_b32_e32 v1, s13 +; GCN2-NEXT: v_mov_b32_e32 v2, s14 +; GCN2-NEXT: v_mov_b32_e32 v3, s15 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12627,23 +12617,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB92_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -12688,18 +12678,18 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[12:13], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -12714,10 +12704,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: v_mov_b32_e32 v0, s10 +; GCN1-NEXT: v_mov_b32_e32 v1, s11 +; GCN1-NEXT: v_mov_b32_e32 v2, s14 +; GCN1-NEXT: v_mov_b32_e32 v3, s15 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12725,21 +12715,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN1-NEXT: s_cbranch_execnz .LBB93_2 ; GCN1-NEXT: .LBB93_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: v_mov_b32_e32 v5, s10 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset: @@ -12748,14 +12738,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[12:13], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -12770,10 +12760,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 +; GCN2-NEXT: v_mov_b32_e32 v2, s14 +; GCN2-NEXT: v_mov_b32_e32 v3, s15 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12787,10 +12777,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s10 +; GCN2-NEXT: v_mov_b32_e32 v4, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -12799,7 +12789,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -12850,25 +12840,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s18, -1 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 s16, s16, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; GCN1-NEXT: s_load_dword s12, s[2:3], 0x43 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x43 ; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s2, s4, s2 -; GCN1-NEXT: s_addc_u32 s3, s5, s3 +; GCN1-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s2, s8, s2 +; GCN1-NEXT: s_addc_u32 s3, s9, s3 ; GCN1-NEXT: s_add_u32 s2, s2, 32 ; GCN1-NEXT: s_addc_u32 s3, s3, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s12 +; GCN1-NEXT: s_cmp_eq_u32 s3, s4 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB94_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s3 -; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v0, s12 +; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s2 @@ -12881,7 +12871,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB94_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -12889,7 +12879,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: v_mov_b32_e32 v3, s2 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc @@ -12897,8 +12887,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB94_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -12908,25 +12898,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GCN2-NEXT: s_load_dword s12, s[2:3], 0x10c +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x10c ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s2, s4, s2 -; GCN2-NEXT: s_addc_u32 s3, s5, s3 +; GCN2-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s2, s8, s2 +; GCN2-NEXT: s_addc_u32 s3, s9, s3 ; GCN2-NEXT: s_add_u32 s2, s2, 32 ; GCN2-NEXT: s_addc_u32 s3, s3, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s12 +; GCN2-NEXT: s_cmp_eq_u32 s3, s4 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB94_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s3 -; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v0, s12 +; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s2 @@ -12945,8 +12935,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: v_mov_b32_e32 v3, s2 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc @@ -12954,29 +12944,29 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB94_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[10:11] +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s3, s5 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccz .LBB94_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -12993,11 +12983,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s13, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s12, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 ; GFX12-NEXT: .LBB94_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -13015,47 +13005,47 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN1-NEXT: s_mov_b64 s[6:7], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB95_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB95_4 ; GCN1-NEXT: .LBB95_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB95_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB95_2 ; GCN1-NEXT: .LBB95_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen @@ -13067,46 +13057,46 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN2-NEXT: s_mov_b64 s[6:7], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB95_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB95_4 ; GCN2-NEXT: .LBB95_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB95_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB95_2 ; GCN2-NEXT: .LBB95_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -13116,38 +13106,38 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB95_3 ; GFX12-NEXT: ; %bb.1: ; %Flow -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB95_4 ; GFX12-NEXT: .LBB95_2: ; %atomicrmw.phi ; GFX12-NEXT: s_endpgm ; GFX12-NEXT: .LBB95_3: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_cbranch_execnz .LBB95_2 ; GFX12-NEXT: .LBB95_4: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo -; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -13157,26 +13147,26 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s0, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cmp_eq_u32 s9, s0 ; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN1-NEXT: s_cbranch_vccz .LBB96_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_mov_b32_e32 v4, s8 +; GCN1-NEXT: v_mov_b32_e32 v0, s12 +; GCN1-NEXT: v_mov_b32_e32 v1, s13 +; GCN1-NEXT: v_mov_b32_e32 v2, s14 +; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v5, s9 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -13185,25 +13175,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN1-NEXT: .LBB96_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB96_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[8:9], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: s_cselect_b32 s0, s8, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB96_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -13213,22 +13203,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s0, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cmp_eq_u32 s9, s0 ; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN2-NEXT: s_cbranch_vccz .LBB96_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_mov_b32_e32 v4, s8 +; GCN2-NEXT: v_mov_b32_e32 v0, s12 +; GCN2-NEXT: v_mov_b32_e32 v1, s13 +; GCN2-NEXT: v_mov_b32_e32 v2, s14 +; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v5, s9 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -13237,30 +13227,30 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN2-NEXT: .LBB96_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB96_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GCN2-NEXT: s_cselect_b32 s0, s8, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB96_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s9 @@ -13302,18 +13292,18 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[12:13], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -13326,10 +13316,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB97_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: v_mov_b32_e32 v0, s10 +; GCN1-NEXT: v_mov_b32_e32 v1, s11 +; GCN1-NEXT: v_mov_b32_e32 v2, s14 +; GCN1-NEXT: v_mov_b32_e32 v3, s15 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13337,21 +13327,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN1-NEXT: s_cbranch_execnz .LBB97_2 ; GCN1-NEXT: .LBB97_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: v_mov_b32_e32 v5, s10 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s11 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64: @@ -13360,14 +13350,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[12:13], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -13380,10 +13370,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB97_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: v_mov_b32_e32 v0, s10 +; GCN2-NEXT: v_mov_b32_e32 v1, s11 +; GCN2-NEXT: v_mov_b32_e32 v2, s14 +; GCN2-NEXT: v_mov_b32_e32 v3, s15 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13397,10 +13387,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s10 +; GCN2-NEXT: v_mov_b32_e32 v4, s11 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -13409,7 +13399,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -13458,23 +13448,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s18, -1 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 -; GCN1-NEXT: s_add_u32 s16, s16, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s12, s[2:3], 0x43 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x43 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 ; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s2, s4, s2 -; GCN1-NEXT: s_addc_u32 s3, s5, s3 -; GCN1-NEXT: s_cmp_eq_u32 s3, s12 +; GCN1-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s2, s8, s2 +; GCN1-NEXT: s_addc_u32 s3, s9, s3 +; GCN1-NEXT: s_cmp_eq_u32 s3, s6 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB98_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s3 -; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v0, s12 +; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s2 @@ -13487,7 +13477,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB98_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -13495,7 +13485,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: v_mov_b32_e32 v3, s2 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc @@ -13503,8 +13493,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB98_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -13514,23 +13504,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s12, s[2:3], 0x10c -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0x10c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s2, s4, s2 -; GCN2-NEXT: s_addc_u32 s3, s5, s3 -; GCN2-NEXT: s_cmp_eq_u32 s3, s12 +; GCN2-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s2, s8, s2 +; GCN2-NEXT: s_addc_u32 s3, s9, s3 +; GCN2-NEXT: s_cmp_eq_u32 s3, s6 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB98_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s3 -; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v0, s12 +; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s2 @@ -13549,8 +13539,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: v_mov_b32_e32 v3, s2 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc @@ -13558,27 +13548,28 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB98_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX12-NEXT: s_mov_b64 s[10:11], src_private_base -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s11 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] +; GFX12-NEXT: s_cmp_eq_u32 s3, s5 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccz .LBB98_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -13595,11 +13586,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s13, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s12, vcc_lo ; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 ; GFX12-NEXT: .LBB98_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -13613,7 +13604,7 @@ entry: define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -13629,7 +13620,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -13645,7 +13636,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -13664,7 +13655,7 @@ entry: define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -13678,7 +13669,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -13692,7 +13683,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -13710,10 +13701,10 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -13730,10 +13721,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -13751,10 +13742,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13775,10 +13766,10 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -13793,10 +13784,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -13812,10 +13803,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13835,7 +13826,7 @@ entry: define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -13848,7 +13839,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -13861,7 +13852,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -13877,7 +13868,7 @@ entry: define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -13888,7 +13879,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -13899,7 +13890,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -13914,34 +13905,34 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s6, s0 -; GCN1-NEXT: s_addc_u32 s1, s7, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s6, s0 -; GCN2-NEXT: s_addc_u32 s1, s7, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -13949,10 +13940,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13970,30 +13961,30 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s6, s0 -; GCN1-NEXT: s_addc_u32 s1, s7, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN1-NEXT: s_add_u32 s0, s2, s0 +; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s6, s0 -; GCN2-NEXT: s_addc_u32 s1, s7, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GCN2-NEXT: s_add_u32 s0, s2, s0 +; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -14001,10 +13992,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-LABEL: atomic_store_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -14024,36 +14015,36 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB107_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB107_4 ; GCN1-NEXT: .LBB107_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB107_2 ; GCN1-NEXT: .LBB107_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -14064,7 +14055,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen @@ -14076,29 +14067,29 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB107_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB107_4 ; GCN2-NEXT: .LBB107_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14115,7 +14106,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen @@ -14124,7 +14115,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -14172,23 +14163,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB108_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14197,26 +14188,26 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB108_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB108_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB108_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -14226,23 +14217,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB108_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14251,44 +14242,44 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB108_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB108_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB108_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB108_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -14298,18 +14289,18 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB108_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX12-NEXT: s_cselect_b32 s2, s2, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB108_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -14325,40 +14316,40 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB109_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB109_4 ; GCN1-NEXT: .LBB109_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB109_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB109_2 ; GCN1-NEXT: .LBB109_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -14369,7 +14360,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen @@ -14381,33 +14372,33 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB109_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB109_4 ; GCN2-NEXT: .LBB109_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB109_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14424,7 +14415,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen @@ -14434,10 +14425,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -14483,18 +14474,18 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -14503,9 +14494,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB110_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14519,21 +14510,21 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB110_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -14543,14 +14534,14 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -14559,9 +14550,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB110_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14581,20 +14572,20 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB110_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -14645,36 +14636,36 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB111_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB111_4 ; GCN1-NEXT: .LBB111_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB111_2 ; GCN1-NEXT: .LBB111_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen @@ -14684,7 +14675,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen @@ -14695,35 +14686,35 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB111_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB111_4 ; GCN2-NEXT: .LBB111_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB111_2 ; GCN2-NEXT: .LBB111_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen @@ -14733,7 +14724,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen @@ -14742,7 +14733,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -14786,22 +14777,22 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB112_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14810,26 +14801,26 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB112_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB112_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB112_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -14838,22 +14829,22 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB112_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14862,43 +14853,43 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB112_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB112_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB112_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB112_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -14907,18 +14898,18 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB112_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB112_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s2, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB112_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -14933,38 +14924,38 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB113_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB113_4 ; GCN1-NEXT: .LBB113_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB113_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB113_2 ; GCN1-NEXT: .LBB113_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 @@ -14975,7 +14966,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen @@ -14987,31 +14978,31 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB113_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB113_4 ; GCN2-NEXT: .LBB113_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB113_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15028,7 +15019,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen @@ -15038,16 +15029,15 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_inc_i64_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB113_3 @@ -15085,27 +15075,27 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB114_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -15119,21 +15109,21 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB114_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -15143,23 +15133,23 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB114_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15179,20 +15169,20 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB114_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -15242,47 +15232,47 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s4, 32 -; GCN1-NEXT: s_addc_u32 s1, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB115_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB115_4 ; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB115_2 ; GCN1-NEXT: .LBB115_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -15297,29 +15287,29 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s4, 32 -; GCN2-NEXT: s_addc_u32 s1, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB115_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB115_4 ; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15332,11 +15322,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -15348,7 +15338,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 @@ -15401,15 +15391,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 +; GCN1-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s4, 32 -; GCN1-NEXT: s_addc_u32 s3, s5, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_add_u32 s2, s8, 32 +; GCN1-NEXT: s_addc_u32 s3, s9, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s6 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB116_2 @@ -15448,8 +15438,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB116_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -15459,15 +15449,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s4, 32 -; GCN2-NEXT: s_addc_u32 s3, s5, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_add_u32 s2, s8, 32 +; GCN2-NEXT: s_addc_u32 s3, s9, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s6 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB116_2 @@ -15505,27 +15495,27 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB116_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], 32 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s9 -; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB116_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -15536,22 +15526,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: .LBB116_3: ; %atomicrmw.private ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s4, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_cselect_b32 s6, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 ; GFX12-NEXT: .LBB116_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -15567,51 +15557,51 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB117_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB117_4 ; GCN1-NEXT: .LBB117_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB117_2 ; GCN1-NEXT: .LBB117_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -15626,33 +15616,33 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB117_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB117_4 ; GCN2-NEXT: .LBB117_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15665,11 +15655,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -15682,10 +15672,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base @@ -15736,18 +15726,18 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 @@ -15756,9 +15746,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB118_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -15768,29 +15758,29 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB118_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(1) ; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] ; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] ; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB118_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -15800,14 +15790,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 @@ -15816,9 +15806,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB118_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15834,13 +15824,13 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) ; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] ; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] ; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ -15848,14 +15838,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB118_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) @@ -15911,46 +15901,46 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s0 -; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB119_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB119_4 ; GCN1-NEXT: .LBB119_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB119_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB119_2 ; GCN1-NEXT: .LBB119_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -15964,45 +15954,45 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s0 -; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB119_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB119_4 ; GCN2-NEXT: .LBB119_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB119_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_cbranch_execnz .LBB119_2 ; GCN2-NEXT: .LBB119_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -16014,7 +16004,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_u32 s1, s5 @@ -16063,21 +16053,21 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cmp_eq_u32 s9, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB120_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16087,10 +16077,10 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB120_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB120_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[8:9], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s1 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: s_cselect_b32 s2, s8, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: s_add_i32 s2, s2, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s2 @@ -16109,8 +16099,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB120_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -16119,21 +16109,21 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cmp_eq_u32 s9, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB120_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16143,8 +16133,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB120_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB120_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GCN2-NEXT: s_cselect_b32 s2, s8, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: s_add_i32 s2, s2, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s2 @@ -16164,26 +16154,26 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB120_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s5, s1 -; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB120_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -16192,24 +16182,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: .LBB120_2: ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB120_3: ; %atomicrmw.private -; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cselect_b32 s4, s4, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s6, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] ; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 ; GFX12-NEXT: .LBB120_4: ; %atomicrmw.end -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -16224,49 +16213,49 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: s_cmp_eq_u32 s1, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB121_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB121_4 ; GCN1-NEXT: .LBB121_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_cbranch_execnz .LBB121_2 ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -16281,31 +16270,31 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: s_cmp_eq_u32 s1, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB121_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB121_4 ; GCN2-NEXT: .LBB121_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -16318,11 +16307,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: v_mov_b32_e32 v5, s2 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] ; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] @@ -16335,16 +16324,15 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_dec_i64_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB121_3 @@ -16387,27 +16375,27 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN1-NEXT: s_mov_b32 s14, -1 -; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s9 -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 -; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN1-NEXT: s_add_u32 s0, s4, s0 -; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB122_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -16417,29 +16405,29 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB122_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s0 -; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(1) ; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] ; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] ; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen ; GCN1-NEXT: .LBB122_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -16449,23 +16437,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s9 -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN2-NEXT: s_add_u32 s0, s4, s0 -; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB122_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -16481,13 +16469,13 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s8 -; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) ; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] ; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] ; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ -16495,14 +16483,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB122_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index d9c6e4ad5006ac..8991a062f37a4c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_add_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -21,7 +21,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_add_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -53,14 +53,14 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_add_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -72,14 +72,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_add_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -92,15 +92,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -113,17 +113,17 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_add_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -132,17 +132,17 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_add_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -152,10 +152,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -174,7 +174,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -195,7 +195,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -216,7 +216,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -240,7 +240,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_add_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_add_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -266,7 +266,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -282,50 +282,50 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_add_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_add_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -337,15 +337,15 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_add_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -354,15 +354,15 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_add_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -372,10 +372,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -393,7 +393,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_add_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -412,7 +412,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_add_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -431,7 +431,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -454,7 +454,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_and_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -469,7 +469,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_and_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -484,7 +484,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -501,14 +501,14 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_and_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -520,14 +520,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_and_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -540,15 +540,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -561,17 +561,17 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_and_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -580,17 +580,17 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_and_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -600,10 +600,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -622,7 +622,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -643,7 +643,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -664,7 +664,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -688,7 +688,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_and_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -701,7 +701,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_and_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -714,7 +714,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -730,50 +730,50 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_and_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_and_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -785,15 +785,15 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_and_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -802,15 +802,15 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_and_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -820,10 +820,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -841,7 +841,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_and_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -860,7 +860,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_and_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -879,7 +879,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -902,7 +902,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_sub_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -917,7 +917,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_sub_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -932,7 +932,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -949,14 +949,14 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_sub_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -968,14 +968,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_sub_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -988,15 +988,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1009,17 +1009,17 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_sub_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1028,17 +1028,17 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_sub_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1048,10 +1048,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1070,7 +1070,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -1091,7 +1091,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1136,7 +1136,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_sub_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_sub_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1162,7 +1162,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1178,50 +1178,50 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_sub_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_sub_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1233,15 +1233,15 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_sub_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1250,15 +1250,15 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_sub_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1268,10 +1268,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1289,7 +1289,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_sub_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -1308,7 +1308,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_sub_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1350,7 +1350,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_max_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -1364,7 +1364,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_max_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1395,14 +1395,14 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_max_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1414,14 +1414,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_max_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1434,15 +1434,15 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1455,17 +1455,17 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,17 +1473,17 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_max_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1492,10 +1492,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1514,7 +1514,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -1535,7 +1535,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1556,7 +1556,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1580,7 +1580,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_max_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_max_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1620,34 +1620,34 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_max_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -1655,15 +1655,15 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1675,15 +1675,15 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1691,15 +1691,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_max_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1708,10 +1708,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1729,7 +1729,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -1748,7 +1748,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1790,7 +1790,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umax_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -1804,7 +1804,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_umax_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -1818,7 +1818,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1835,14 +1835,14 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_umax_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,14 +1854,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GFX8-LABEL: atomic_umax_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1874,15 +1874,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1895,17 +1895,17 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,17 +1913,17 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GFX8-LABEL: atomic_umax_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1932,10 +1932,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1954,7 +1954,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -1975,7 +1975,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1996,7 +1996,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2020,7 +2020,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umax_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2032,7 +2032,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_umax_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2044,7 +2044,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2060,34 +2060,34 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_umax_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -2095,15 +2095,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -2115,15 +2115,15 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2131,15 +2131,15 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_umax_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2148,10 +2148,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -2169,7 +2169,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2188,7 +2188,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2207,7 +2207,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2230,7 +2230,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_min_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2244,7 +2244,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_min_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2275,14 +2275,14 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_min_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2294,14 +2294,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_min_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2314,15 +2314,15 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -2335,17 +2335,17 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2353,17 +2353,17 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_min_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2372,10 +2372,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -2394,7 +2394,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2415,7 +2415,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2436,7 +2436,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2460,7 +2460,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_min_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2472,7 +2472,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_min_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2484,7 +2484,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2500,34 +2500,34 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_min_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -2535,15 +2535,15 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -2555,15 +2555,15 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2571,15 +2571,15 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_min_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2588,10 +2588,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -2609,7 +2609,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,7 +2628,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2647,7 +2647,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2670,7 +2670,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umin_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2684,7 +2684,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_umin_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -2698,7 +2698,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2715,14 +2715,14 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_umin_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2734,14 +2734,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GFX8-LABEL: atomic_umin_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2754,15 +2754,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -2775,17 +2775,17 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umin_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2793,17 +2793,17 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GFX8-LABEL: atomic_umin_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2812,10 +2812,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -2834,7 +2834,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2855,7 +2855,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2876,7 +2876,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2900,7 +2900,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_umin_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_umin_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2924,7 +2924,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2940,34 +2940,34 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_umin_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umin_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -2975,15 +2975,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -2995,15 +2995,15 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umin_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3011,15 +3011,15 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_umin_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3028,10 +3028,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3049,7 +3049,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umin_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3068,7 +3068,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX8-LABEL: atomic_umin_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3087,7 +3087,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3110,7 +3110,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_or_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3125,7 +3125,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_or_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -3140,7 +3140,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3157,14 +3157,14 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_or_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3176,14 +3176,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; ; GFX8-LABEL: atomic_or_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3196,15 +3196,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3217,17 +3217,17 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_or_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3236,17 +3236,17 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; ; GFX8-LABEL: atomic_or_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3256,10 +3256,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3278,7 +3278,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3299,7 +3299,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3320,7 +3320,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3344,7 +3344,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_or_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -3357,7 +3357,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_or_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3370,7 +3370,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3386,50 +3386,50 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_or_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_or_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3441,15 +3441,15 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_or_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3458,15 +3458,15 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; ; GFX8-LABEL: atomic_or_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3476,10 +3476,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3497,7 +3497,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_or_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3516,7 +3516,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GFX8-LABEL: atomic_or_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3535,7 +3535,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3558,7 +3558,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_xchg_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3573,7 +3573,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_xchg_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -3588,7 +3588,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3605,7 +3605,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX7-LABEL: atomic_xchg_f64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3620,7 +3620,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GFX8-LABEL: atomic_xchg_f64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -3635,7 +3635,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3652,7 +3652,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX7-LABEL: atomic_xchg_pointer_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3667,7 +3667,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; ; GFX8-LABEL: atomic_xchg_pointer_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -3682,7 +3682,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3699,14 +3699,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_xchg_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3718,14 +3718,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GFX8-LABEL: atomic_xchg_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3738,15 +3738,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3759,17 +3759,17 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xchg_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3778,17 +3778,17 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GFX8-LABEL: atomic_xchg_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3798,10 +3798,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -3820,7 +3820,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3841,7 +3841,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX8-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3886,7 +3886,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_xchg_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -3899,7 +3899,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_xchg_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3912,7 +3912,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3928,50 +3928,50 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_xchg_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xchg_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3983,15 +3983,15 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xchg_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4000,15 +4000,15 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_xchg_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4018,10 +4018,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4039,7 +4039,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xchg_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -4058,7 +4058,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX8-LABEL: atomic_xchg_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4077,7 +4077,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4100,7 +4100,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_xor_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4115,7 +4115,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_xor_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -4130,7 +4130,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4147,14 +4147,14 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_xor_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4166,14 +4166,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_xor_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4186,15 +4186,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -4207,17 +4207,17 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xor_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4226,17 +4226,17 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_xor_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4246,10 +4246,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4268,7 +4268,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -4289,7 +4289,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4310,7 +4310,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4334,7 +4334,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_xor_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -4347,7 +4347,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_xor_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4360,7 +4360,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4376,50 +4376,50 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_xor_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_xor_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -4431,15 +4431,15 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xor_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4448,15 +4448,15 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_xor_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4466,10 +4466,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4487,7 +4487,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_xor_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -4506,7 +4506,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_xor_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4525,7 +4525,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4548,7 +4548,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX7-LABEL: atomic_load_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4564,7 +4564,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GFX8-LABEL: atomic_load_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -4580,7 +4580,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4599,7 +4599,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GFX7-LABEL: atomic_load_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -4613,7 +4613,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GFX8-LABEL: atomic_load_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4627,7 +4627,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4645,10 +4645,10 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 @@ -4665,10 +4665,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; ; GFX8-LABEL: atomic_load_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 @@ -4686,10 +4686,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4710,10 +4710,10 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4728,10 +4728,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; ; GFX8-LABEL: atomic_load_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -4747,10 +4747,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4770,7 +4770,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX7-LABEL: atomic_store_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_add_u32 s0, s2, 32 @@ -4783,7 +4783,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GFX8-LABEL: atomic_store_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_add_u32 s0, s2, 32 @@ -4796,7 +4796,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4812,7 +4812,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GFX7-LABEL: atomic_store_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -4823,7 +4823,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GFX8-LABEL: atomic_store_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4834,7 +4834,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4849,34 +4849,34 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_store_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s6, s0 -; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s2, s0 +; GFX7-NEXT: s_addc_u32 s1, s3, s1 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s6, s0 -; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -4884,10 +4884,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4905,30 +4905,30 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_store_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s6, s0 -; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s2, s0 +; GFX7-NEXT: s_addc_u32 s1, s3, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s6, s0 -; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -4936,10 +4936,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -4956,17 +4956,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s2, s4, 32 -; GFX7-NEXT: s_addc_u32 s3, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -4974,17 +4974,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; ; GFX8-LABEL: atomic_cmpxchg_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_u32 s2, s4, 32 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -4993,12 +4993,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5012,17 +5012,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_soffset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s2, s4, 0x11940 -; GFX7-NEXT: s_addc_u32 s3, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 0x11940 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -5030,17 +5030,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; ; GFX8-LABEL: atomic_cmpxchg_i64_soffset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_u32 s2, s4, 0x11940 -; GFX8-NEXT: s_addc_u32 s3, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s0, s0, 0x11940 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -5049,12 +5049,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5068,7 +5068,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -5088,7 +5088,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -5108,7 +5108,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5130,7 +5130,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 @@ -5150,7 +5150,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GFX8-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 @@ -5170,7 +5170,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5191,67 +5191,67 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: s_add_u32 s0, s4, s2 -; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: s_add_u32 s0, s8, s2 +; GFX7-NEXT: s_addc_u32 s3, s9, s3 ; GFX7-NEXT: s_add_u32 s2, s0, 32 ; GFX7-NEXT: s_addc_u32 s3, s3, 0 ; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_add_u32 s0, s4, s2 -; GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: s_add_u32 s0, s8, s2 +; GFX8-NEXT: s_addc_u32 s3, s9, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: s_addc_u32 s3, s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -5266,15 +5266,15 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -5282,15 +5282,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; ; GFX8-LABEL: atomic_cmpxchg_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -5299,12 +5299,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5317,7 +5317,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: v_mov_b32_e32 v5, s1 @@ -5335,7 +5335,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -5353,7 +5353,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5374,7 +5374,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 @@ -5392,7 +5392,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GFX8-LABEL: atomic_cmpxchg_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 @@ -5410,7 +5410,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5430,63 +5430,63 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX7-NEXT: s_add_u32 s2, s4, s2 -; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX7-NEXT: s_add_u32 s2, s8, s2 +; GFX7-NEXT: s_addc_u32 s3, s9, s3 ; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX8-NEXT: s_add_u32 s2, s4, s2 -; GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX8-NEXT: s_add_u32 s2, s8, s2 +; GFX8-NEXT: s_addc_u32 s3, s9, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -5500,7 +5500,7 @@ entry: define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX7-LABEL: atomic_load_f64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -5516,7 +5516,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GFX8-LABEL: atomic_load_f64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -5532,7 +5532,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5551,7 +5551,7 @@ entry: define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX7-LABEL: atomic_load_f64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -5565,7 +5565,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GFX8-LABEL: atomic_load_f64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -5579,7 +5579,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5597,10 +5597,10 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_f64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 @@ -5617,10 +5617,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; ; GFX8-LABEL: atomic_load_f64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 @@ -5638,10 +5638,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -5662,10 +5662,10 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_load_f64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -5680,10 +5680,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; ; GFX8-LABEL: atomic_load_f64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -5699,10 +5699,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -5722,7 +5722,7 @@ entry: define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX7-LABEL: atomic_store_f64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_add_u32 s0, s2, 32 @@ -5735,7 +5735,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GFX8-LABEL: atomic_store_f64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_add_u32 s0, s2, 32 @@ -5748,7 +5748,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5764,7 +5764,7 @@ entry: define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX7-LABEL: atomic_store_f64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -5775,7 +5775,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GFX8-LABEL: atomic_store_f64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -5786,7 +5786,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5801,34 +5801,34 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_store_f64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s6, s0 -; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s2, s0 +; GFX7-NEXT: s_addc_u32 s1, s3, s1 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_f64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s6, s0 -; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -5836,10 +5836,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -5857,30 +5857,30 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { ; GFX7-LABEL: atomic_store_f64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s6, s0 -; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s2, s0 +; GFX7-NEXT: s_addc_u32 s1, s3, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_store_f64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s6, s0 -; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -5888,10 +5888,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-LABEL: atomic_store_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -5908,7 +5908,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_inc_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -5923,7 +5923,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_inc_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -5938,7 +5938,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5955,14 +5955,14 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_inc_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5974,14 +5974,14 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_inc_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5994,15 +5994,15 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -6015,17 +6015,17 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_inc_i64_incr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6034,17 +6034,17 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_inc_i64_incr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6054,10 +6054,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -6076,7 +6076,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6097,7 +6097,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -6118,7 +6118,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6142,7 +6142,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_inc_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -6155,7 +6155,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_inc_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -6168,7 +6168,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6184,50 +6184,50 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_inc_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_inc_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -6239,15 +6239,15 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_inc_i64_incr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6256,15 +6256,15 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_inc_i64_incr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6274,10 +6274,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_inc_i64_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -6295,7 +6295,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_inc_i64_ret_incr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6314,7 +6314,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_inc_i64_ret_incr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -6333,7 +6333,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6356,7 +6356,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_dec_i64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -6371,7 +6371,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_dec_i64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 @@ -6386,7 +6386,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6403,14 +6403,14 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_dec_i64_ret_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6422,14 +6422,14 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_dec_i64_ret_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6442,15 +6442,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -6463,17 +6463,17 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_dec_i64_decr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6482,17 +6482,17 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_dec_i64_decr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6502,10 +6502,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -6524,7 +6524,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6545,7 +6545,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -6566,7 +6566,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6590,7 +6590,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_dec_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -6603,7 +6603,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_dec_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -6616,7 +6616,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6632,50 +6632,50 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-LABEL: atomic_dec_i64_ret: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_dec_i64_ret: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -6687,15 +6687,15 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_dec_i64_decr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX7-NEXT: s_add_u32 s0, s4, s0 -; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6704,15 +6704,15 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_dec_i64_decr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX8-NEXT: s_add_u32 s0, s4, s0 -; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6722,10 +6722,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_dec_i64_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -6743,7 +6743,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_dec_i64_ret_decr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -6762,7 +6762,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_dec_i64_ret_decr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -6781,7 +6781,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index d7bd4b1e4918e8..2989f08ac56e7b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -6581,96 +6581,6 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ret void } -define void @flat_atomic_nand_i64_noret_offset__noalias_private(ptr %out, i64 %in) { -; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 -; GCN1-NEXT: v_and_b32_e32 v1, v6, v2 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB52_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, v7, v3 -; GCN2-NEXT: v_and_b32_e32 v1, v6, v2 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB52_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 -; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 -; GCN3-NEXT: v_not_b32_e32 v5, v4 -; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB52_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 - ret void -} - define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_ret: ; GCN1: ; %bb.0: @@ -6682,14 +6592,14 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB53_4 +; GCN1-NEXT: s_cbranch_execz .LBB52_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB53_2: ; %atomicrmw.start +; GCN1-NEXT: .LBB52_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -6704,15 +6614,15 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB53_2 +; GCN1-NEXT: s_cbranch_execnz .LBB52_2 ; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: ; implicit-def: $vgpr2 -; GCN1-NEXT: .LBB53_4: ; %Flow3 +; GCN1-NEXT: .LBB52_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB53_6 +; GCN1-NEXT: s_cbranch_execz .LBB52_6 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -6727,7 +6637,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_not_b32_e32 v3, v3 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN1-NEXT: .LBB53_6: ; %atomicrmw.phi +; GCN1-NEXT: .LBB52_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 @@ -6744,14 +6654,14 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB53_4 +; GCN2-NEXT: s_cbranch_execz .LBB52_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB53_2: ; %atomicrmw.start +; GCN2-NEXT: .LBB52_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -6766,15 +6676,15 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB53_2 +; GCN2-NEXT: s_cbranch_execnz .LBB52_2 ; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: ; implicit-def: $vgpr2 -; GCN2-NEXT: .LBB53_4: ; %Flow3 +; GCN2-NEXT: .LBB52_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB53_6 +; GCN2-NEXT: s_cbranch_execz .LBB52_6 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -6789,7 +6699,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_not_b32_e32 v3, v3 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN2-NEXT: .LBB53_6: ; %atomicrmw.phi +; GCN2-NEXT: .LBB52_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 @@ -6806,17 +6716,17 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_3 +; GCN3-NEXT: s_cbranch_execnz .LBB52_3 ; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_6 -; GCN3-NEXT: .LBB53_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB52_6 +; GCN3-NEXT: .LBB52_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB53_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB52_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB53_4: ; %atomicrmw.start +; GCN3-NEXT: .LBB52_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6831,15 +6741,15 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB53_4 +; GCN3-NEXT: s_cbranch_execnz .LBB52_4 ; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB53_2 -; GCN3-NEXT: .LBB53_6: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB52_2 +; GCN3-NEXT: .LBB52_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -6872,20 +6782,20 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB54_3 +; GCN1-NEXT: s_cbranch_execnz .LBB53_3 ; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB54_6 -; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB53_6 +; GCN1-NEXT: .LBB53_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB53_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN1-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6900,15 +6810,15 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB54_4 +; GCN1-NEXT: s_cbranch_execnz .LBB53_4 ; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB54_2 -; GCN1-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB53_2 +; GCN1-NEXT: .LBB53_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -6938,20 +6848,20 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB54_3 +; GCN2-NEXT: s_cbranch_execnz .LBB53_3 ; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB54_6 -; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB53_6 +; GCN2-NEXT: .LBB53_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB53_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN2-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6966,15 +6876,15 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB54_4 +; GCN2-NEXT: s_cbranch_execnz .LBB53_4 ; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB54_2 -; GCN2-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB53_2 +; GCN2-NEXT: .LBB53_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -7002,17 +6912,17 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB54_3 +; GCN3-NEXT: s_cbranch_execnz .LBB53_3 ; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB54_6 -; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB53_6 +; GCN3-NEXT: .LBB53_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB53_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN3-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -7027,15 +6937,15 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB54_4 +; GCN3-NEXT: s_cbranch_execnz .LBB53_4 ; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB54_2 -; GCN3-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB53_2 +; GCN3-NEXT: .LBB53_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -7056,98 +6966,6 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ret i64 %result } -define i64 @flat_atomic_nand_i64_ret_offset__noalias_private(ptr %out, i64 %in) { -; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_and_b32_e32 v0, v9, v3 -; GCN1-NEXT: v_and_b32_e32 v1, v8, v2 -; GCN1-NEXT: v_not_b32_e32 v7, v0 -; GCN1-NEXT: v_not_b32_e32 v6, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB55_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_and_b32_e32 v0, v9, v3 -; GCN2-NEXT: v_and_b32_e32 v1, v8, v2 -; GCN2-NEXT: v_not_b32_e32 v7, v0 -; GCN2-NEXT: v_not_b32_e32 v6, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB55_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 -; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 -; GCN3-NEXT: v_not_b32_e32 v5, v4 -; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB55_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 -; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 - ret i64 %result -} - define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { ; GCN1-LABEL: flat_atomic_nand_i64_noret_scalar: ; GCN1: ; %bb.0: @@ -7159,13 +6977,13 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN1-NEXT: s_cbranch_vccnz .LBB54_3 ; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccnz .LBB56_6 -; GCN1-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_vccnz .LBB54_6 +; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 @@ -7175,7 +6993,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB56_4: ; %atomicrmw.start +; GCN1-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -7190,11 +7008,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB56_4 +; GCN1-NEXT: s_cbranch_execnz .LBB54_4 ; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_branch .LBB56_2 -; GCN1-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN1-NEXT: s_branch .LBB54_2 +; GCN1-NEXT: .LBB54_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -7224,13 +7042,13 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN2-NEXT: s_cbranch_vccnz .LBB54_3 ; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccnz .LBB56_6 -; GCN2-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_vccnz .LBB54_6 +; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -7240,7 +7058,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB56_4: ; %atomicrmw.start +; GCN2-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -7255,11 +7073,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB56_4 +; GCN2-NEXT: s_cbranch_execnz .LBB54_4 ; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_branch .LBB56_2 -; GCN2-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN2-NEXT: s_branch .LBB54_2 +; GCN2-NEXT: .LBB54_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -7286,18 +7104,18 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN3-NEXT: s_cbranch_vccnz .LBB54_3 ; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccnz .LBB56_6 -; GCN3-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_vccnz .LBB54_6 +; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB56_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB56_4: ; %atomicrmw.start +; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -7312,11 +7130,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB56_4 +; GCN3-NEXT: s_cbranch_execnz .LBB54_4 ; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_branch .LBB56_2 -; GCN3-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN3-NEXT: s_branch .LBB54_2 +; GCN3-NEXT: .LBB54_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -7349,13 +7167,13 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN1-NEXT: s_cbranch_vccnz .LBB55_3 ; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccnz .LBB57_6 -; GCN1-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_vccnz .LBB55_6 +; GCN1-NEXT: .LBB55_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB55_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s36, s34, 4 ; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 @@ -7365,7 +7183,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN1-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -7380,11 +7198,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB57_4 +; GCN1-NEXT: s_cbranch_execnz .LBB55_4 ; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_branch .LBB57_2 -; GCN1-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN1-NEXT: s_branch .LBB55_2 +; GCN1-NEXT: .LBB55_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -7416,13 +7234,13 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN2-NEXT: s_cbranch_vccnz .LBB55_3 ; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccnz .LBB57_6 -; GCN2-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_vccnz .LBB55_6 +; GCN2-NEXT: .LBB55_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB55_3: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s36, s34, 4 ; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 @@ -7432,7 +7250,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN2-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -7447,11 +7265,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB57_4 +; GCN2-NEXT: s_cbranch_execnz .LBB55_4 ; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_branch .LBB57_2 -; GCN2-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN2-NEXT: s_branch .LBB55_2 +; GCN2-NEXT: .LBB55_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -7480,18 +7298,18 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN3-NEXT: s_cbranch_vccnz .LBB55_3 ; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccnz .LBB57_6 -; GCN3-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_vccnz .LBB55_6 +; GCN3-NEXT: .LBB55_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB55_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s34 ; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -7506,11 +7324,11 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN3-NEXT: s_cbranch_execnz .LBB57_4 +; GCN3-NEXT: s_cbranch_execnz .LBB55_4 ; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN3-NEXT: s_branch .LBB57_2 -; GCN3-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN3-NEXT: s_branch .LBB55_2 +; GCN3-NEXT: .LBB55_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -7531,184 +7349,82 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ret void } -define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace(ptr inreg %out, i64 inreg %in) { -; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB56_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s34, s4, 4 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s36 -; GCN1-NEXT: v_mov_b32_e32 v1, s37 -; GCN1-NEXT: v_mov_b32_e32 v4, s34 -; GCN1-NEXT: v_mov_b32_e32 v5, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN1-NEXT: v_not_b32_e32 v1, v0 -; GCN1-NEXT: v_not_b32_e32 v0, v6 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB58_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB56_2 +; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB56_6 +; GCN1-NEXT: .LBB56_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB56_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN1-NEXT: v_not_b32_e32 v4, v4 +; GCN1-NEXT: v_not_b32_e32 v5, v5 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB56_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: +; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 32 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s36 -; GCN2-NEXT: v_mov_b32_e32 v1, s37 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN2-NEXT: v_not_b32_e32 v1, v0 -; GCN2-NEXT: v_not_b32_e32 v0, v6 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB58_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 -; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 -; GCN3-NEXT: v_not_b32_e32 v1, v0 -; GCN3-NEXT: v_not_b32_e32 v0, v6 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB58_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 - ret void -} - -define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { -; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s5, s34 -; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB59_4 -; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB59_2: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB59_2 -; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_branch .LBB59_6 -; GCN1-NEXT: .LBB59_4: -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB59_6 -; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 -; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec -; GCN1-NEXT: s_cselect_b32 s34, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 -; GCN1-NEXT: v_not_b32_e32 v4, v4 -; GCN1-NEXT: v_not_b32_e32 v5, v5 -; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB59_6: ; %atomicrmw.phi -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s5, s34 -; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB59_4 -; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB56_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s34, s4, 4 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 @@ -7717,7 +7433,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB59_2: ; %atomicrmw.start +; GCN2-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -7732,13 +7448,13 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB59_2 +; GCN2-NEXT: s_cbranch_execnz .LBB56_2 ; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_branch .LBB59_6 -; GCN2-NEXT: .LBB59_4: +; GCN2-NEXT: s_branch .LBB56_6 +; GCN2-NEXT: .LBB56_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB59_6 +; GCN2-NEXT: s_cbranch_execz .LBB56_6 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 @@ -7755,7 +7471,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_not_b32_e32 v5, v5 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN2-NEXT: .LBB56_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -7766,13 +7482,13 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB59_4 +; GCN3-NEXT: s_cbranch_vccz .LBB56_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB59_2: ; %atomicrmw.start +; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -7787,13 +7503,13 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB59_2 +; GCN3-NEXT: s_cbranch_execnz .LBB56_2 ; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_branch .LBB59_6 -; GCN3-NEXT: .LBB59_4: +; GCN3-NEXT: s_branch .LBB56_6 +; GCN3-NEXT: .LBB56_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB59_6 +; GCN3-NEXT: s_cbranch_execz .LBB56_6 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 @@ -7808,7 +7524,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_not_b32_e32 v3, v3 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN3-NEXT: .LBB56_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i64 %in seq_cst @@ -7827,7 +7543,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB60_4 +; GCN1-NEXT: s_cbranch_vccz .LBB57_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s36, s34, 4 ; GCN1-NEXT: s_addc_u32 s37, s35, 0 @@ -7838,7 +7554,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -7853,13 +7569,13 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB60_2 +; GCN1-NEXT: s_cbranch_execnz .LBB57_2 ; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_branch .LBB60_6 -; GCN1-NEXT: .LBB60_4: +; GCN1-NEXT: s_branch .LBB57_6 +; GCN1-NEXT: .LBB57_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: s_cbranch_execz .LBB60_6 +; GCN1-NEXT: s_cbranch_execz .LBB57_6 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec @@ -7877,7 +7593,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_not_b32_e32 v5, v5 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN1-NEXT: .LBB57_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -7892,7 +7608,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB60_4 +; GCN2-NEXT: s_cbranch_vccz .LBB57_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s36, s34, 4 ; GCN2-NEXT: s_addc_u32 s37, s35, 0 @@ -7903,7 +7619,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -7918,13 +7634,13 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB60_2 +; GCN2-NEXT: s_cbranch_execnz .LBB57_2 ; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_branch .LBB60_6 -; GCN2-NEXT: .LBB60_4: +; GCN2-NEXT: s_branch .LBB57_6 +; GCN2-NEXT: .LBB57_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: s_cbranch_execz .LBB60_6 +; GCN2-NEXT: s_cbranch_execz .LBB57_6 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 @@ -7941,7 +7657,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_not_b32_e32 v5, v5 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN2-NEXT: .LBB57_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -7954,13 +7670,13 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB60_4 +; GCN3-NEXT: s_cbranch_vccz .LBB57_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v2, s34 ; GCN3-NEXT: v_mov_b32_e32 v3, s35 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -7975,13 +7691,13 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN3-NEXT: s_cbranch_execnz .LBB60_2 +; GCN3-NEXT: s_cbranch_execnz .LBB57_2 ; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN3-NEXT: s_branch .LBB60_6 -; GCN3-NEXT: .LBB60_4: +; GCN3-NEXT: s_branch .LBB57_6 +; GCN3-NEXT: .LBB57_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: s_cbranch_execz .LBB60_6 +; GCN3-NEXT: s_cbranch_execz .LBB57_6 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 @@ -7996,7 +7712,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_not_b32_e32 v3, v3 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN3-NEXT: .LBB57_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -8004,108 +7720,6 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ret i64 %result } -define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar__noalias_private(ptr inreg %out, i64 inreg %in) { -; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 32 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s36 -; GCN1-NEXT: v_mov_b32_e32 v1, s37 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB61_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 32 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s36 -; GCN2-NEXT: v_mov_b32_e32 v1, s37 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: flat_load_dword v0, v[2:3] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB61_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN3-NEXT: v_not_b32_e32 v5, v0 -; GCN3-NEXT: v_not_b32_e32 v4, v1 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB61_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 - ret i64 %result -} - define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: @@ -8118,20 +7732,20 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB62_3 +; GCN1-NEXT: s_cbranch_execnz .LBB58_3 ; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB62_6 -; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB58_6 +; GCN1-NEXT: .LBB58_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB62_4: ; %atomicrmw.start +; GCN1-NEXT: .LBB58_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 @@ -8146,15 +7760,15 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB62_4 +; GCN1-NEXT: s_cbranch_execnz .LBB58_4 ; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB62_2 -; GCN1-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB58_2 +; GCN1-NEXT: .LBB58_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -8183,20 +7797,20 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB62_3 +; GCN2-NEXT: s_cbranch_execnz .LBB58_3 ; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB62_6 -; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB58_6 +; GCN2-NEXT: .LBB58_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB62_4: ; %atomicrmw.start +; GCN2-NEXT: .LBB58_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 @@ -8211,15 +7825,15 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB62_4 +; GCN2-NEXT: s_cbranch_execnz .LBB58_4 ; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB62_2 -; GCN2-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB58_2 +; GCN2-NEXT: .LBB58_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -8246,17 +7860,17 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB62_3 +; GCN3-NEXT: s_cbranch_execnz .LBB58_3 ; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB62_6 -; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB58_6 +; GCN3-NEXT: .LBB58_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB62_4: ; %atomicrmw.start +; GCN3-NEXT: .LBB58_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 @@ -8271,15 +7885,15 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: s_cbranch_execnz .LBB58_4 ; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB62_2 -; GCN3-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB58_2 +; GCN3-NEXT: .LBB58_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -8313,20 +7927,20 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB63_3 +; GCN1-NEXT: s_cbranch_execnz .LBB59_3 ; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB63_6 -; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB59_6 +; GCN1-NEXT: .LBB59_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB59_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[6:7], 0 -; GCN1-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN1-NEXT: .LBB59_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -8341,15 +7955,15 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN1-NEXT: s_cbranch_execnz .LBB63_4 +; GCN1-NEXT: s_cbranch_execnz .LBB59_4 ; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB63_2 -; GCN1-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB59_2 +; GCN1-NEXT: .LBB59_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -8379,20 +7993,20 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB63_3 +; GCN2-NEXT: s_cbranch_execnz .LBB59_3 ; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB63_6 -; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB59_6 +; GCN2-NEXT: .LBB59_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB59_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[6:7], 0 -; GCN2-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN2-NEXT: .LBB59_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -8407,15 +8021,15 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: s_cbranch_execnz .LBB59_4 ; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB63_2 -; GCN2-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB59_2 +; GCN2-NEXT: .LBB59_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -8443,17 +8057,17 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB63_3 +; GCN3-NEXT: s_cbranch_execnz .LBB59_3 ; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB63_6 -; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB59_6 +; GCN3-NEXT: .LBB59_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB59_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GCN3-NEXT: s_mov_b64 s[6:7], 0 -; GCN3-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN3-NEXT: .LBB59_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -8468,15 +8082,15 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: s_cbranch_execnz .LBB59_4 ; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB63_2 -; GCN3-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB59_2 +; GCN3-NEXT: .LBB59_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -8511,22 +8125,22 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB64_3 +; GCN1-NEXT: s_cbranch_execnz .LBB60_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB64_4 -; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB60_4 +; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB64_2 -; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB60_2 +; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -8551,22 +8165,22 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB64_3 +; GCN2-NEXT: s_cbranch_execnz .LBB60_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB64_4 -; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB60_4 +; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB64_2 -; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB60_2 +; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -8589,22 +8203,22 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB64_3 +; GCN3-NEXT: s_cbranch_execnz .LBB60_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB64_4 -; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB60_4 +; GCN3-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB64_2 -; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB60_2 +; GCN3-NEXT: .LBB60_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -8634,22 +8248,22 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB65_3 +; GCN1-NEXT: s_cbranch_execnz .LBB61_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB65_4 -; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB61_4 +; GCN1-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB61_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB65_2 -; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB61_2 +; GCN1-NEXT: .LBB61_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -8676,22 +8290,22 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB65_3 +; GCN2-NEXT: s_cbranch_execnz .LBB61_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB65_4 -; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB61_4 +; GCN2-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB61_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB65_2 -; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB61_2 +; GCN2-NEXT: .LBB61_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -8716,22 +8330,22 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB65_3 +; GCN3-NEXT: s_cbranch_execnz .LBB61_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB65_4 -; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB61_4 +; GCN3-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB61_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB65_2 -; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB61_2 +; GCN3-NEXT: .LBB61_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -8763,22 +8377,22 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB66_3 +; GCN1-NEXT: s_cbranch_execnz .LBB62_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB66_4 -; GCN1-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB62_4 +; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB66_2 -; GCN1-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB62_2 +; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -8806,22 +8420,22 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB66_3 +; GCN2-NEXT: s_cbranch_execnz .LBB62_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB66_4 -; GCN2-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB62_4 +; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB66_2 -; GCN2-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB62_2 +; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -8847,22 +8461,22 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB66_3 +; GCN3-NEXT: s_cbranch_execnz .LBB62_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB66_4 -; GCN3-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB66_2 -; GCN3-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB62_2 +; GCN3-NEXT: .LBB62_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -8893,22 +8507,22 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB67_3 +; GCN1-NEXT: s_cbranch_execnz .LBB63_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB67_4 -; GCN1-NEXT: .LBB67_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB63_4 +; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB67_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB67_2 -; GCN1-NEXT: .LBB67_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB63_2 +; GCN1-NEXT: .LBB63_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -8936,22 +8550,22 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB67_3 +; GCN2-NEXT: s_cbranch_execnz .LBB63_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB67_4 -; GCN2-NEXT: .LBB67_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB67_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB67_2 -; GCN2-NEXT: .LBB67_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB63_2 +; GCN2-NEXT: .LBB63_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -8977,22 +8591,22 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB67_3 +; GCN3-NEXT: s_cbranch_execnz .LBB63_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB67_4 -; GCN3-NEXT: .LBB67_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB67_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB67_2 -; GCN3-NEXT: .LBB67_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB63_2 +; GCN3-NEXT: .LBB63_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -9022,13 +8636,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN1-NEXT: s_cbranch_vccnz .LBB64_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB68_4 -; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_vccz .LBB64_4 +; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -9036,8 +8650,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB68_2 -; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execnz .LBB64_2 +; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -9065,13 +8679,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN2-NEXT: s_cbranch_vccnz .LBB64_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB68_4 -; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_vccz .LBB64_4 +; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -9079,8 +8693,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB68_2 -; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execnz .LBB64_2 +; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -9105,13 +8719,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN3-NEXT: s_cbranch_vccnz .LBB64_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB68_4 -; GCN3-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_vccz .LBB64_4 +; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 @@ -9119,8 +8733,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB68_2 -; GCN3-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execnz .LBB64_2 +; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -9151,13 +8765,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB69_3 +; GCN1-NEXT: s_cbranch_vccnz .LBB65_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB69_4 -; GCN1-NEXT: .LBB69_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_vccz .LBB65_4 +; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB69_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 @@ -9165,8 +8779,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB69_2 -; GCN1-NEXT: .LBB69_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execnz .LBB65_2 +; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -9196,13 +8810,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB69_3 +; GCN2-NEXT: s_cbranch_vccnz .LBB65_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB69_4 -; GCN2-NEXT: .LBB69_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_vccz .LBB65_4 +; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB69_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 @@ -9210,8 +8824,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB69_2 -; GCN2-NEXT: .LBB69_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execnz .LBB65_2 +; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -9238,13 +8852,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB69_3 +; GCN3-NEXT: s_cbranch_vccnz .LBB65_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB69_4 -; GCN3-NEXT: .LBB69_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_vccz .LBB65_4 +; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB69_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 @@ -9252,8 +8866,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB69_2 -; GCN3-NEXT: .LBB69_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execnz .LBB65_2 +; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -9282,7 +8896,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB70_2 +; GCN1-NEXT: s_cbranch_vccz .LBB66_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -9291,11 +8905,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB70_3 -; GCN1-NEXT: s_branch .LBB70_4 -; GCN1-NEXT: .LBB70_2: +; GCN1-NEXT: s_cbranch_execz .LBB66_3 +; GCN1-NEXT: s_branch .LBB66_4 +; GCN1-NEXT: .LBB66_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB70_3: ; %atomicrmw.private +; GCN1-NEXT: .LBB66_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -9310,7 +8924,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB70_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB66_4: ; %atomicrmw.end ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -9323,7 +8937,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB70_2 +; GCN2-NEXT: s_cbranch_vccz .LBB66_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -9332,11 +8946,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB70_3 -; GCN2-NEXT: s_branch .LBB70_4 -; GCN2-NEXT: .LBB70_2: +; GCN2-NEXT: s_cbranch_execz .LBB66_3 +; GCN2-NEXT: s_branch .LBB66_4 +; GCN2-NEXT: .LBB66_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB70_3: ; %atomicrmw.private +; GCN2-NEXT: .LBB66_3: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -9350,7 +8964,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB70_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB66_4: ; %atomicrmw.end ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -9361,7 +8975,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB70_2 +; GCN3-NEXT: s_cbranch_vccz .LBB66_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -9370,11 +8984,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB70_3 -; GCN3-NEXT: s_branch .LBB70_4 -; GCN3-NEXT: .LBB70_2: +; GCN3-NEXT: s_cbranch_execz .LBB66_3 +; GCN3-NEXT: s_branch .LBB66_4 +; GCN3-NEXT: .LBB66_2: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB70_3: ; %atomicrmw.private +; GCN3-NEXT: .LBB66_3: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -9386,7 +9000,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB70_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB66_4: ; %atomicrmw.end ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst @@ -9405,7 +9019,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB71_2 +; GCN1-NEXT: s_cbranch_vccz .LBB67_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -9414,11 +9028,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB71_3 -; GCN1-NEXT: s_branch .LBB71_4 -; GCN1-NEXT: .LBB71_2: +; GCN1-NEXT: s_cbranch_execz .LBB67_3 +; GCN1-NEXT: s_branch .LBB67_4 +; GCN1-NEXT: .LBB67_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -9433,7 +9047,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -9448,7 +9062,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB71_2 +; GCN2-NEXT: s_cbranch_vccz .LBB67_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -9457,11 +9071,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB71_3 -; GCN2-NEXT: s_branch .LBB71_4 -; GCN2-NEXT: .LBB71_2: +; GCN2-NEXT: s_cbranch_execz .LBB67_3 +; GCN2-NEXT: s_branch .LBB67_4 +; GCN2-NEXT: .LBB67_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -9475,7 +9089,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -9488,7 +9102,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB71_2 +; GCN3-NEXT: s_cbranch_vccz .LBB67_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -9497,11 +9111,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB71_3 -; GCN3-NEXT: s_branch .LBB71_4 -; GCN3-NEXT: .LBB71_2: +; GCN3-NEXT: s_cbranch_execz .LBB67_3 +; GCN3-NEXT: s_branch .LBB67_4 +; GCN3-NEXT: .LBB67_2: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN3-NEXT: .LBB67_3: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -9513,7 +9127,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB71_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB67_4: ; %atomicrmw.end ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -9533,22 +9147,22 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB72_3 +; GCN1-NEXT: s_cbranch_execnz .LBB68_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB72_4 -; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB68_4 +; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB72_2 -; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB68_2 +; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9575,22 +9189,22 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB72_3 +; GCN2-NEXT: s_cbranch_execnz .LBB68_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB72_4 -; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB68_4 +; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB72_2 -; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB68_2 +; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9615,22 +9229,22 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB72_3 +; GCN3-NEXT: s_cbranch_execnz .LBB68_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB72_4 -; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB68_4 +; GCN3-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB72_2 -; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB68_2 +; GCN3-NEXT: .LBB68_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -9662,22 +9276,22 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB73_3 +; GCN1-NEXT: s_cbranch_execnz .LBB69_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB73_4 -; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB69_4 +; GCN1-NEXT: .LBB69_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB69_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB73_2 -; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB69_2 +; GCN1-NEXT: .LBB69_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -9705,22 +9319,22 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB73_3 +; GCN2-NEXT: s_cbranch_execnz .LBB69_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB73_4 -; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB69_4 +; GCN2-NEXT: .LBB69_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB69_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB73_2 -; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB69_2 +; GCN2-NEXT: .LBB69_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -9746,22 +9360,22 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB73_3 +; GCN3-NEXT: s_cbranch_execnz .LBB69_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB73_4 -; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB69_4 +; GCN3-NEXT: .LBB69_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB69_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB73_2 -; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB69_2 +; GCN3-NEXT: .LBB69_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -9794,22 +9408,22 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB74_3 +; GCN1-NEXT: s_cbranch_execnz .LBB70_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB74_4 -; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB70_4 +; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB74_2 -; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB70_2 +; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9834,22 +9448,22 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB74_3 +; GCN2-NEXT: s_cbranch_execnz .LBB70_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB74_4 -; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB70_4 +; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB74_2 -; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB70_2 +; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9872,22 +9486,22 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB74_3 +; GCN3-NEXT: s_cbranch_execnz .LBB70_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB74_4 -; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB70_4 +; GCN3-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB74_2 -; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB70_2 +; GCN3-NEXT: .LBB70_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -9917,22 +9531,22 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB75_3 +; GCN1-NEXT: s_cbranch_execnz .LBB71_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB75_4 -; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB71_4 +; GCN1-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB71_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB75_2 -; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB71_2 +; GCN1-NEXT: .LBB71_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9959,22 +9573,22 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB75_3 +; GCN2-NEXT: s_cbranch_execnz .LBB71_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB75_4 -; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB71_4 +; GCN2-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB71_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB75_2 -; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB71_2 +; GCN2-NEXT: .LBB71_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9999,22 +9613,22 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB75_3 +; GCN3-NEXT: s_cbranch_execnz .LBB71_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB75_4 -; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB71_4 +; GCN3-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB71_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB75_2 -; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB71_2 +; GCN3-NEXT: .LBB71_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -10046,22 +9660,22 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB76_3 +; GCN1-NEXT: s_cbranch_execnz .LBB72_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB76_4 -; GCN1-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB72_4 +; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB76_2 -; GCN1-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB72_2 +; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -10089,22 +9703,22 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB76_3 +; GCN2-NEXT: s_cbranch_execnz .LBB72_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB76_4 -; GCN2-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB72_4 +; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB76_2 -; GCN2-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB72_2 +; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -10130,22 +9744,22 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB76_3 +; GCN3-NEXT: s_cbranch_execnz .LBB72_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB76_4 -; GCN3-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB72_4 +; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB76_2 -; GCN3-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB72_2 +; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -10176,22 +9790,22 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB77_3 +; GCN1-NEXT: s_cbranch_execnz .LBB73_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB77_4 -; GCN1-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB73_4 +; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB77_2 -; GCN1-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB73_2 +; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -10219,22 +9833,22 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB77_3 +; GCN2-NEXT: s_cbranch_execnz .LBB73_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB77_4 -; GCN2-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB73_4 +; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB77_2 -; GCN2-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB73_2 +; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -10260,22 +9874,22 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB77_3 +; GCN3-NEXT: s_cbranch_execnz .LBB73_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB77_4 -; GCN3-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB73_4 +; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB77_2 -; GCN3-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB73_2 +; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -10305,13 +9919,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN1-NEXT: s_cbranch_vccnz .LBB74_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB78_4 -; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_vccz .LBB74_4 +; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -10319,8 +9933,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB78_2 -; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execnz .LBB74_2 +; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -10348,13 +9962,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN2-NEXT: s_cbranch_vccnz .LBB74_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB78_4 -; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_vccz .LBB74_4 +; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -10362,8 +9976,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB78_2 -; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execnz .LBB74_2 +; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -10388,13 +10002,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN3-NEXT: s_cbranch_vccnz .LBB74_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB78_4 -; GCN3-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_vccz .LBB74_4 +; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 @@ -10402,8 +10016,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB78_2 -; GCN3-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execnz .LBB74_2 +; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -10434,13 +10048,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 -; GCN1-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN1-NEXT: s_cbranch_vccnz .LBB75_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB79_4 -; GCN1-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_vccz .LBB75_4 +; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 @@ -10448,8 +10062,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB79_2 -; GCN1-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execnz .LBB75_2 +; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -10479,13 +10093,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 -; GCN2-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN2-NEXT: s_cbranch_vccnz .LBB75_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB79_4 -; GCN2-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_vccz .LBB75_4 +; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 @@ -10493,8 +10107,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB79_2 -; GCN2-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execnz .LBB75_2 +; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -10521,13 +10135,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 -; GCN3-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN3-NEXT: s_cbranch_vccnz .LBB75_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB79_4 -; GCN3-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_vccz .LBB75_4 +; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 @@ -10535,8 +10149,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB79_2 -; GCN3-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execnz .LBB75_2 +; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -10565,7 +10179,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB80_2 +; GCN1-NEXT: s_cbranch_vccz .LBB76_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -10574,11 +10188,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB80_3 -; GCN1-NEXT: s_branch .LBB80_4 -; GCN1-NEXT: .LBB80_2: +; GCN1-NEXT: s_cbranch_execz .LBB76_3 +; GCN1-NEXT: s_branch .LBB76_4 +; GCN1-NEXT: .LBB76_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN1-NEXT: .LBB76_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -10593,7 +10207,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB76_4: ; %atomicrmw.end ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -10606,7 +10220,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB80_2 +; GCN2-NEXT: s_cbranch_vccz .LBB76_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -10615,11 +10229,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB80_3 -; GCN2-NEXT: s_branch .LBB80_4 -; GCN2-NEXT: .LBB80_2: +; GCN2-NEXT: s_cbranch_execz .LBB76_3 +; GCN2-NEXT: s_branch .LBB76_4 +; GCN2-NEXT: .LBB76_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN2-NEXT: .LBB76_3: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -10633,7 +10247,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB76_4: ; %atomicrmw.end ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -10644,7 +10258,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB80_2 +; GCN3-NEXT: s_cbranch_vccz .LBB76_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -10653,11 +10267,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB80_3 -; GCN3-NEXT: s_branch .LBB80_4 -; GCN3-NEXT: .LBB80_2: +; GCN3-NEXT: s_cbranch_execz .LBB76_3 +; GCN3-NEXT: s_branch .LBB76_4 +; GCN3-NEXT: .LBB76_2: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN3-NEXT: .LBB76_3: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -10669,7 +10283,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB76_4: ; %atomicrmw.end ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst @@ -10688,7 +10302,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB81_2 +; GCN1-NEXT: s_cbranch_vccz .LBB77_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -10697,11 +10311,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB81_3 -; GCN1-NEXT: s_branch .LBB81_4 -; GCN1-NEXT: .LBB81_2: +; GCN1-NEXT: s_cbranch_execz .LBB77_3 +; GCN1-NEXT: s_branch .LBB77_4 +; GCN1-NEXT: .LBB77_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -10716,7 +10330,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -10731,7 +10345,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB81_2 +; GCN2-NEXT: s_cbranch_vccz .LBB77_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -10740,11 +10354,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB81_3 -; GCN2-NEXT: s_branch .LBB81_4 -; GCN2-NEXT: .LBB81_2: +; GCN2-NEXT: s_cbranch_execz .LBB77_3 +; GCN2-NEXT: s_branch .LBB77_4 +; GCN2-NEXT: .LBB77_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -10758,7 +10372,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -10771,7 +10385,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB81_2 +; GCN3-NEXT: s_cbranch_vccz .LBB77_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 ; GCN3-NEXT: v_mov_b32_e32 v2, s6 @@ -10780,11 +10394,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB81_3 -; GCN3-NEXT: s_branch .LBB81_4 -; GCN3-NEXT: .LBB81_2: +; GCN3-NEXT: s_cbranch_execz .LBB77_3 +; GCN3-NEXT: s_branch .LBB77_4 +; GCN3-NEXT: .LBB77_2: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN3-NEXT: .LBB77_3: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -10796,7 +10410,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB77_4: ; %atomicrmw.end ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -10816,22 +10430,22 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_3 +; GCN1-NEXT: s_cbranch_execnz .LBB78_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_4 -; GCN1-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB78_4 +; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB82_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB82_2 -; GCN1-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB78_2 +; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -10858,22 +10472,22 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_3 +; GCN2-NEXT: s_cbranch_execnz .LBB78_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_4 -; GCN2-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB78_4 +; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB82_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB82_2 -; GCN2-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB78_2 +; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -10898,22 +10512,22 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_3 +; GCN3-NEXT: s_cbranch_execnz .LBB78_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_4 -; GCN3-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB78_4 +; GCN3-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB82_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB78_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB82_2 -; GCN3-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB78_2 +; GCN3-NEXT: .LBB78_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -10945,22 +10559,22 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_3 +; GCN1-NEXT: s_cbranch_execnz .LBB79_3 ; GCN1-NEXT: ; %bb.1: ; %Flow ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_4 -; GCN1-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN1-NEXT: s_cbranch_execnz .LBB79_4 +; GCN1-NEXT: .LBB79_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN1-NEXT: .LBB79_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB83_2 -; GCN1-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB79_2 +; GCN1-NEXT: .LBB79_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -10988,22 +10602,22 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_3 +; GCN2-NEXT: s_cbranch_execnz .LBB79_3 ; GCN2-NEXT: ; %bb.1: ; %Flow ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_4 -; GCN2-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN2-NEXT: s_cbranch_execnz .LBB79_4 +; GCN2-NEXT: .LBB79_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN2-NEXT: .LBB79_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB83_2 -; GCN2-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB79_2 +; GCN2-NEXT: .LBB79_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -11029,22 +10643,22 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_3 +; GCN3-NEXT: s_cbranch_execnz .LBB79_3 ; GCN3-NEXT: ; %bb.1: ; %Flow ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_4 -; GCN3-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN3-NEXT: s_cbranch_execnz .LBB79_4 +; GCN3-NEXT: .LBB79_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] -; GCN3-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN3-NEXT: .LBB79_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execz .LBB83_2 -; GCN3-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB79_2 +; GCN3-NEXT: .LBB79_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -11071,12 +10685,26 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_max_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB80_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB80_6 +; GCN1-NEXT: .LBB80_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB80_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -11087,23 +10715,55 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB80_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB80_2 +; GCN1-NEXT: .LBB80_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB80_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB80_6 +; GCN2-NEXT: .LBB80_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB80_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -11114,20 +10774,50 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB80_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB80_2 +; GCN2-NEXT: .LBB80_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB80_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB80_6 +; GCN3-NEXT: .LBB80_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB80_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -11138,14 +10828,31 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB80_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB80_2 +; GCN3-NEXT: .LBB80_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst ret void } @@ -11153,86 +10860,181 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_max_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB81_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB81_6 +; GCN1-NEXT: .LBB81_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB81_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB81_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB81_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB81_2 +; GCN1-NEXT: .LBB81_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB81_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB81_6 +; GCN2-NEXT: .LBB81_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB81_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB81_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB81_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB81_2 +; GCN2-NEXT: .LBB81_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB81_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB81_6 +; GCN3-NEXT: .LBB81_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB81_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB81_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB81_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB81_2 +; GCN3-NEXT: .LBB81_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst ret void } @@ -11240,12 +11042,21 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_max_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB82_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: flat_load_dword v5, v[5:6] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB82_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -11257,24 +11068,53 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB82_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: .LBB82_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB82_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB82_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB82_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: flat_load_dword v5, v[5:6] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB82_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -11286,21 +11126,48 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB82_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: .LBB82_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB82_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB82_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB82_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB82_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -11312,15 +11179,34 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB82_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: .LBB82_4: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB82_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB82_6: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw max ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -11328,43 +11214,91 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB83_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB83_6 +; GCN1-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB83_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB83_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB83_2 +; GCN1-NEXT: .LBB83_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB83_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB83_6 +; GCN2-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB83_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -11376,40 +11310,88 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB83_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB83_2 +; GCN2-NEXT: .LBB83_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB83_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB83_6 +; GCN3-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB83_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB83_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB83_2 +; GCN3-NEXT: .LBB83_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw max ptr %gep, i64 %in seq_cst ret i64 %result } @@ -11417,20 +11399,32 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_max_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB84_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB84_6 +; GCN1-NEXT: .LBB84_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB84_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -11444,28 +11438,59 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB84_4 +; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB84_2 +; GCN1-NEXT: .LBB84_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB84_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB84_6 +; GCN2-NEXT: .LBB84_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB84_3: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -11479,23 +11504,51 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB84_4 +; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB84_2 +; GCN2-NEXT: .LBB84_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB84_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB84_6 +; GCN3-NEXT: .LBB84_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB84_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -11509,11 +11562,27 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB84_4 +; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB84_2 +; GCN3-NEXT: .LBB84_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst ret void } @@ -11521,20 +11590,34 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_max_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB85_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB85_6 +; GCN1-NEXT: .LBB85_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB85_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -11545,31 +11628,64 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB85_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB85_2 +; GCN1-NEXT: .LBB85_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s36 -; GCN2-NEXT: v_mov_b32_e32 v1, s37 -; GCN2-NEXT: v_mov_b32_e32 v4, s34 -; GCN2-NEXT: v_mov_b32_e32 v5, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB85_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB85_6 +; GCN2-NEXT: .LBB85_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB85_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -11580,45 +11696,91 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB85_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB85_2 +; GCN2-NEXT: .LBB85_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB85_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB85_6 +; GCN3-NEXT: .LBB85_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB85_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB85_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB85_2 +; GCN3-NEXT: .LBB85_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst ret void } @@ -11626,20 +11788,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_max_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB86_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -11653,28 +11821,57 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB86_2 +; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB86_6 +; GCN1-NEXT: .LBB86_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB86_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB86_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB86_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -11688,23 +11885,49 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB86_2 +; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB86_6 +; GCN2-NEXT: .LBB86_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB86_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB86_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB86_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -11718,11 +11941,31 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB86_2 +; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB86_6 +; GCN3-NEXT: .LBB86_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB86_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB86_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw max ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -11730,20 +11973,28 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB87_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -11755,30 +12006,61 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB87_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB87_6 +; GCN1-NEXT: .LBB87_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB87_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB87_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB87_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 ; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -11790,25 +12072,53 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB87_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB87_6 +; GCN2-NEXT: .LBB87_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB87_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB87_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB87_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -11816,39 +12126,77 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 - ret i64 %result -} - -define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { -; GCN1-LABEL: atomic_max_i64_addr64_offset: -; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB87_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB87_6 +; GCN3-NEXT: .LBB87_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB87_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB87_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst + ret i64 %result +} + +define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_max_i64_addr64_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s11 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB88_3 +; GCN1-NEXT: ; %bb.1: ; %Flow6 +; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccnz .LBB88_6 +; GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -11859,30 +12207,67 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB88_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_branch .LBB88_2 +; GCN1-NEXT: .LBB88_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB88_3 +; GCN2-NEXT: ; %bb.1: ; %Flow6 +; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccnz .LBB88_6 +; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -11893,164 +12278,318 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB88_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_branch .LBB88_2 +; GCN2-NEXT: .LBB88_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_mov_b32 s14, -1 +; GCN3-NEXT: s_mov_b32 s15, 0xe00000 +; GCN3-NEXT: s_add_u32 s12, s12, s11 +; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s6 +; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_add_u32 s0, s0, 32 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_addc_u32 s1, s1, 0 +; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB88_3 +; GCN3-NEXT: ; %bb.1: ; %Flow6 +; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_cbranch_vccnz .LBB88_6 +; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi +; GCN3-NEXT: s_endpgm +; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v5, s1 ; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB88_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_branch .LBB88_2 +; GCN3-NEXT: .LBB88_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst ret void } define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB89_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v3 -; GCN1-NEXT: v_mov_b32_e32 v8, v2 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_cbranch_execnz .LBB89_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_branch .LBB89_6 +; GCN1-NEXT: .LBB89_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB89_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB89_6: ; %atomicrmw.phi +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB89_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v3 -; GCN2-NEXT: v_mov_b32_e32 v8, v2 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_cbranch_execnz .LBB89_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_branch .LBB89_6 +; GCN2-NEXT: .LBB89_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB89_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB89_6: ; %atomicrmw.phi +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_mov_b32 s18, -1 +; GCN3-NEXT: s_mov_b32 s19, 0xe00000 +; GCN3-NEXT: s_add_u32 s16, s16, s11 +; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN3-NEXT: s_addc_u32 s17, s17, 0 +; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s9 -; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN3-NEXT: s_add_u32 s0, s8, s0 +; GCN3-NEXT: s_addc_u32 s1, s9, s1 +; GCN3-NEXT: s_add_u32 s0, s0, 32 +; GCN3-NEXT: s_addc_u32 s1, s1, 0 +; GCN3-NEXT: s_cmp_eq_u32 s1, s3 +; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN3-NEXT: s_cbranch_vccz .LBB89_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v3 -; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_cbranch_execnz .LBB89_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_branch .LBB89_6 +; GCN3-NEXT: .LBB89_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB89_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s12 +; GCN3-NEXT: v_mov_b32_e32 v3, s13 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: .LBB89_6: ; %atomicrmw.phi +; GCN3-NEXT: v_mov_b32_e32 v2, s10 +; GCN3-NEXT: v_mov_b32_e32 v3, s11 +; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } @@ -12058,19 +12597,37 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s11 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB90_3 +; GCN1-NEXT: ; %bb.1: ; %Flow6 +; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccnz .LBB90_6 +; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -12081,28 +12638,65 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB90_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_branch .LBB90_2 +; GCN1-NEXT: .LBB90_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB90_3 +; GCN2-NEXT: ; %bb.1: ; %Flow6 +; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccnz .LBB90_6 +; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -12113,31 +12707,67 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB90_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_branch .LBB90_2 +; GCN2-NEXT: .LBB90_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_mov_b32 s14, -1 +; GCN3-NEXT: s_mov_b32 s15, 0xe00000 +; GCN3-NEXT: s_add_u32 s12, s12, s11 +; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s6 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB90_3 +; GCN3-NEXT: ; %bb.1: ; %Flow6 +; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_cbranch_vccnz .LBB90_6 +; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi +; GCN3-NEXT: s_endpgm +; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v5, s1 ; GCN3-NEXT: v_mov_b32_e32 v4, s0 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -12145,126 +12775,240 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB90_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_branch .LBB90_2 +; GCN3-NEXT: .LBB90_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst ret void } define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB91_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v3 -; GCN1-NEXT: v_mov_b32_e32 v8, v2 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_cbranch_execnz .LBB91_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_branch .LBB91_6 +; GCN1-NEXT: .LBB91_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB91_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB91_6: ; %atomicrmw.phi +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB91_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v3 -; GCN2-NEXT: v_mov_b32_e32 v8, v2 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_cbranch_execnz .LBB91_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_branch .LBB91_6 +; GCN2-NEXT: .LBB91_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB91_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB91_6: ; %atomicrmw.phi +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_mov_b32 s18, -1 +; GCN3-NEXT: s_mov_b32 s19, 0xe00000 +; GCN3-NEXT: s_add_u32 s16, s16, s11 +; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN3-NEXT: s_addc_u32 s17, s17, 0 +; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s9 -; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN3-NEXT: s_add_u32 s0, s8, s0 +; GCN3-NEXT: s_addc_u32 s1, s9, s1 +; GCN3-NEXT: s_cmp_eq_u32 s1, s3 +; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN3-NEXT: s_cbranch_vccz .LBB91_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v3 -; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_cbranch_execnz .LBB91_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_branch .LBB91_6 +; GCN3-NEXT: .LBB91_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB91_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s12 +; GCN3-NEXT: v_mov_b32_e32 v3, s13 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: .LBB91_6: ; %atomicrmw.phi +; GCN3-NEXT: v_mov_b32_e32 v2, s10 +; GCN3-NEXT: v_mov_b32_e32 v3, s11 +; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } @@ -12273,86 +13017,181 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB92_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB92_6 +; GCN1-NEXT: .LBB92_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB92_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB92_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB92_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB92_2 +; GCN1-NEXT: .LBB92_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB92_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB92_6 +; GCN2-NEXT: .LBB92_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB92_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB92_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB92_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB92_2 +; GCN2-NEXT: .LBB92_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB92_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB92_6 +; GCN3-NEXT: .LBB92_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB92_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB92_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB92_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB92_2 +; GCN3-NEXT: .LBB92_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -12360,14 +13199,29 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB93_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB93_6 +; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB93_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -12379,24 +13233,57 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB93_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB93_2 +; GCN1-NEXT: .LBB93_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB93_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB93_6 +; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB93_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -12408,40 +13295,88 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB93_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB93_2 +; GCN2-NEXT: .LBB93_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB93_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB93_6 +; GCN3-NEXT: .LBB93_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB93_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB93_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB93_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB93_2 +; GCN3-NEXT: .LBB93_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -12453,12 +13388,26 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_umax_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB94_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB94_6 +; GCN1-NEXT: .LBB94_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB94_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB94_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -12469,23 +13418,55 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB94_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB94_2 +; GCN1-NEXT: .LBB94_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB94_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB94_6 +; GCN2-NEXT: .LBB94_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB94_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB94_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -12496,20 +13477,50 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB94_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB94_2 +; GCN2-NEXT: .LBB94_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB94_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB94_6 +; GCN3-NEXT: .LBB94_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB94_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB94_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -12520,14 +13531,31 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB94_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB94_2 +; GCN3-NEXT: .LBB94_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst ret void } @@ -12535,86 +13563,181 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB95_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB95_6 +; GCN1-NEXT: .LBB95_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB95_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB95_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB95_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB95_2 +; GCN1-NEXT: .LBB95_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB95_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB95_6 +; GCN2-NEXT: .LBB95_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB95_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB95_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB95_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB95_2 +; GCN2-NEXT: .LBB95_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB95_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB95_6 +; GCN3-NEXT: .LBB95_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB95_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB95_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB95_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB95_2 +; GCN3-NEXT: .LBB95_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst ret void } @@ -12622,12 +13745,21 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_umax_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB96_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: flat_load_dword v5, v[5:6] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB96_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -12639,24 +13771,53 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB96_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: .LBB96_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB96_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB96_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB96_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: flat_load_dword v5, v[5:6] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB96_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -12668,21 +13829,48 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB96_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: .LBB96_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB96_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB96_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB96_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB96_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -12694,15 +13882,34 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB96_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: .LBB96_4: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB96_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB96_6: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -12710,14 +13917,29 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB97_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB97_6 +; GCN1-NEXT: .LBB97_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB97_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB97_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -12729,24 +13951,57 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB97_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB97_2 +; GCN1-NEXT: .LBB97_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB97_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB97_6 +; GCN2-NEXT: .LBB97_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB97_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB97_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -12758,40 +14013,88 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB97_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB97_2 +; GCN2-NEXT: .LBB97_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB97_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB97_6 +; GCN3-NEXT: .LBB97_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB97_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB97_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB97_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB97_2 +; GCN3-NEXT: .LBB97_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst ret i64 %result } @@ -12799,20 +14102,32 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-LABEL: flat_atomic_umax_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB98_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB98_6 +; GCN1-NEXT: .LBB98_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB98_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -12826,28 +14141,59 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB98_4 +; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_branch .LBB98_2 +; GCN1-NEXT: .LBB98_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB98_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB98_6 +; GCN2-NEXT: .LBB98_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB98_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s34, s4, 4 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -12861,23 +14207,51 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB98_4 +; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB98_2 +; GCN2-NEXT: .LBB98_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB98_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB98_6 +; GCN3-NEXT: .LBB98_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB98_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -12891,11 +14265,27 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB98_4 +; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB98_2 +; GCN3-NEXT: .LBB98_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst ret void } @@ -12903,20 +14293,34 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB99_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB99_6 +; GCN1-NEXT: .LBB99_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB99_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -12927,31 +14331,64 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB99_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB99_2 +; GCN1-NEXT: .LBB99_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB99_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB99_6 +; GCN2-NEXT: .LBB99_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB99_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 ; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -12962,45 +14399,91 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB99_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB99_2 +; GCN2-NEXT: .LBB99_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB99_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB99_6 +; GCN3-NEXT: .LBB99_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB99_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB99_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB99_2 +; GCN3-NEXT: .LBB99_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst ret void } @@ -13008,20 +14491,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_umax_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB100_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -13035,28 +14524,57 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB100_2 +; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB100_6 +; GCN1-NEXT: .LBB100_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB100_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB100_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB100_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -13070,23 +14588,49 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB100_2 +; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB100_6 +; GCN2-NEXT: .LBB100_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB100_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB100_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB100_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -13100,32 +14644,60 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB100_2 +; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 - ret i64 %result -} - -define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { -; GCN1-LABEL: flat_atomic_umax_i64_ret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_branch .LBB100_6 +; GCN3-NEXT: .LBB100_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB100_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB100_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GCN1-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB101_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -13137,30 +14709,61 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB101_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB101_6 +; GCN1-NEXT: .LBB101_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB101_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB101_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB101_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 ; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -13172,25 +14775,53 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB101_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB101_6 +; GCN2-NEXT: .LBB101_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB101_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB101_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB101_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -13198,39 +14829,77 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB101_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB101_6 +; GCN3-NEXT: .LBB101_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB101_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB101_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst ret i64 %result } define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s11 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB102_3 +; GCN1-NEXT: ; %bb.1: ; %Flow6 +; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccnz .LBB102_6 +; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -13241,30 +14910,67 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB102_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_branch .LBB102_2 +; GCN1-NEXT: .LBB102_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB102_3 +; GCN2-NEXT: ; %bb.1: ; %Flow6 +; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccnz .LBB102_6 +; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -13275,164 +14981,318 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB102_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_branch .LBB102_2 +; GCN2-NEXT: .LBB102_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_mov_b32 s14, -1 +; GCN3-NEXT: s_mov_b32 s15, 0xe00000 +; GCN3-NEXT: s_add_u32 s12, s12, s11 +; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s6 +; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_add_u32 s0, s0, 32 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_addc_u32 s1, s1, 0 +; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB102_3 +; GCN3-NEXT: ; %bb.1: ; %Flow6 +; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_cbranch_vccnz .LBB102_6 +; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi +; GCN3-NEXT: s_endpgm +; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v5, s1 ; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB102_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_branch .LBB102_2 +; GCN3-NEXT: .LBB102_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst ret void } define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB103_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v3 -; GCN1-NEXT: v_mov_b32_e32 v8, v2 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB107_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_cbranch_execnz .LBB103_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_branch .LBB103_6 +; GCN1-NEXT: .LBB103_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB103_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB103_6: ; %atomicrmw.phi +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB103_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v3 -; GCN2-NEXT: v_mov_b32_e32 v8, v2 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB107_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_cbranch_execnz .LBB103_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_branch .LBB103_6 +; GCN2-NEXT: .LBB103_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB103_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB103_6: ; %atomicrmw.phi +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_mov_b32 s18, -1 +; GCN3-NEXT: s_mov_b32 s19, 0xe00000 +; GCN3-NEXT: s_add_u32 s16, s16, s11 +; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN3-NEXT: s_addc_u32 s17, s17, 0 +; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s9 -; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN3-NEXT: s_add_u32 s0, s8, s0 +; GCN3-NEXT: s_addc_u32 s1, s9, s1 +; GCN3-NEXT: s_add_u32 s0, s0, 32 +; GCN3-NEXT: s_addc_u32 s1, s1, 0 +; GCN3-NEXT: s_cmp_eq_u32 s1, s3 +; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN3-NEXT: s_cbranch_vccz .LBB103_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v3 -; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB107_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_cbranch_execnz .LBB103_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_branch .LBB103_6 +; GCN3-NEXT: .LBB103_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB103_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s12 +; GCN3-NEXT: v_mov_b32_e32 v3, s13 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: .LBB103_6: ; %atomicrmw.phi +; GCN3-NEXT: v_mov_b32_e32 v2, s10 +; GCN3-NEXT: v_mov_b32_e32 v3, s11 +; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } @@ -13440,111 +15300,209 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB104_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v3 -; GCN1-NEXT: v_mov_b32_e32 v8, v2 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB108_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_cbranch_execnz .LBB104_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_branch .LBB104_6 +; GCN1-NEXT: .LBB104_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB104_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB104_6: ; %atomicrmw.phi +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB104_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v3 -; GCN2-NEXT: v_mov_b32_e32 v8, v2 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB108_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_cbranch_execnz .LBB104_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_branch .LBB104_6 +; GCN2-NEXT: .LBB104_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB104_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB104_6: ; %atomicrmw.phi +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_mov_b32 s18, -1 +; GCN3-NEXT: s_mov_b32 s19, 0xe00000 +; GCN3-NEXT: s_add_u32 s16, s16, s11 +; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN3-NEXT: s_addc_u32 s17, s17, 0 +; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s9 -; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN3-NEXT: s_add_u32 s0, s8, s0 +; GCN3-NEXT: s_addc_u32 s1, s9, s1 +; GCN3-NEXT: s_cmp_eq_u32 s1, s3 +; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN3-NEXT: s_cbranch_vccz .LBB104_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v3 -; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB108_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_cbranch_execnz .LBB104_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_branch .LBB104_6 +; GCN3-NEXT: .LBB104_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB104_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s12 +; GCN3-NEXT: v_mov_b32_e32 v3, s13 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: .LBB104_6: ; %atomicrmw.phi +; GCN3-NEXT: v_mov_b32_e32 v2, s10 +; GCN3-NEXT: v_mov_b32_e32 v3, s11 +; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } @@ -13553,86 +15511,181 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB105_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB105_6 +; GCN1-NEXT: .LBB105_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB105_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB105_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB109_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB105_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB105_2 +; GCN1-NEXT: .LBB105_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB105_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB105_6 +; GCN2-NEXT: .LBB105_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB105_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB105_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB109_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB105_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB105_2 +; GCN2-NEXT: .LBB105_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB105_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB105_6 +; GCN3-NEXT: .LBB105_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB105_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB105_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB109_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB105_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB105_2 +; GCN3-NEXT: .LBB105_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -13640,14 +15693,29 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB106_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB106_6 +; GCN1-NEXT: .LBB106_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB106_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB106_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -13659,24 +15727,57 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB110_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB106_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB106_2 +; GCN1-NEXT: .LBB106_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB106_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB106_6 +; GCN2-NEXT: .LBB106_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB106_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB106_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -13688,40 +15789,88 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB110_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB106_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB106_2 +; GCN2-NEXT: .LBB106_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB106_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB106_6 +; GCN3-NEXT: .LBB106_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB106_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB106_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB110_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB106_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB106_2 +; GCN3-NEXT: .LBB106_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -13733,12 +15882,26 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_umin_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB107_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB107_6 +; GCN1-NEXT: .LBB107_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB107_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -13749,23 +15912,55 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB111_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB107_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB107_2 +; GCN1-NEXT: .LBB107_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB107_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB107_6 +; GCN2-NEXT: .LBB107_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB107_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -13776,20 +15971,50 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB111_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB107_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB107_2 +; GCN2-NEXT: .LBB107_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB107_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB107_6 +; GCN3-NEXT: .LBB107_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB107_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -13800,14 +16025,31 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB111_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB107_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB107_2 +; GCN3-NEXT: .LBB107_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst ret void } @@ -13815,86 +16057,181 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB108_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB108_6 +; GCN1-NEXT: .LBB108_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB108_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB108_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB112_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB108_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB108_2 +; GCN1-NEXT: .LBB108_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB108_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB108_6 +; GCN2-NEXT: .LBB108_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB108_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB108_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB112_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB108_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB108_2 +; GCN2-NEXT: .LBB108_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB108_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB108_6 +; GCN3-NEXT: .LBB108_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB108_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB108_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB112_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB108_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB108_2 +; GCN3-NEXT: .LBB108_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst ret void } @@ -13902,12 +16239,21 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_umin_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB109_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: flat_load_dword v5, v[5:6] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB109_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -13919,24 +16265,53 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB113_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB109_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: .LBB109_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB109_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[4:5], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB109_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB109_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: flat_load_dword v5, v[5:6] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB109_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -13948,21 +16323,48 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB113_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB109_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: .LBB109_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB109_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[4:5], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB109_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB109_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB109_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -13974,15 +16376,34 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB113_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB109_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: .LBB109_4: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB109_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[4:5], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB109_6: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -13990,14 +16411,29 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB110_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB110_6 +; GCN1-NEXT: .LBB110_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB110_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB110_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -14009,24 +16445,57 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB114_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB110_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB110_2 +; GCN1-NEXT: .LBB110_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB110_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB110_6 +; GCN2-NEXT: .LBB110_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB110_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB110_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -14038,40 +16507,88 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB114_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB110_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB110_2 +; GCN2-NEXT: .LBB110_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB110_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB110_6 +; GCN3-NEXT: .LBB110_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB110_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB110_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB114_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB110_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB110_2 +; GCN3-NEXT: .LBB110_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst ret i64 %result } @@ -14079,20 +16596,32 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-LABEL: flat_atomic_umin_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB111_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB111_6 +; GCN1-NEXT: .LBB111_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -14106,28 +16635,59 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB115_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB111_4 +; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB111_2 +; GCN1-NEXT: .LBB111_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB111_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB111_6 +; GCN2-NEXT: .LBB111_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -14141,23 +16701,51 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB115_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB111_4 +; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB111_2 +; GCN2-NEXT: .LBB111_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB111_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB111_6 +; GCN3-NEXT: .LBB111_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB111_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -14171,11 +16759,27 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB115_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB111_4 +; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB111_2 +; GCN3-NEXT: .LBB111_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst ret void } @@ -14183,20 +16787,34 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB112_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB112_6 +; GCN1-NEXT: .LBB112_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB112_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -14207,31 +16825,64 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB116_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB112_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB112_2 +; GCN1-NEXT: .LBB112_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB112_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB112_6 +; GCN2-NEXT: .LBB112_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB112_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 ; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -14242,45 +16893,91 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB116_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB112_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB112_2 +; GCN2-NEXT: .LBB112_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB112_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB112_6 +; GCN3-NEXT: .LBB112_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB112_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB116_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB112_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB112_2 +; GCN3-NEXT: .LBB112_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst ret void } @@ -14288,20 +16985,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_umin_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB113_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -14315,28 +17018,57 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB117_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB113_2 +; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: flat_load_dword v1, v[2:3] -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_branch .LBB113_6 +; GCN1-NEXT: .LBB113_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB113_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB113_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB113_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -14350,23 +17082,49 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB117_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB113_2 +; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB113_6 +; GCN2-NEXT: .LBB113_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB113_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB113_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB113_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -14380,11 +17138,31 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB117_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB113_2 +; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB113_6 +; GCN3-NEXT: .LBB113_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB113_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB113_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -14392,20 +17170,28 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB114_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -14417,30 +17203,61 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB118_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB114_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB114_6 +; GCN1-NEXT: .LBB114_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB114_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB114_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB114_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 ; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -14452,25 +17269,53 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB118_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB114_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB114_6 +; GCN2-NEXT: .LBB114_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB114_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB114_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB114_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -14478,18 +17323,38 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB118_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB114_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB114_6 +; GCN3-NEXT: .LBB114_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB114_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB114_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst ret i64 %result } @@ -14497,86 +17362,181 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB115_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB115_6 +; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB115_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB119_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB115_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB115_2 +; GCN1-NEXT: .LBB115_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB115_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB115_6 +; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB115_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB119_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB115_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB115_2 +; GCN2-NEXT: .LBB115_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB115_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB115_6 +; GCN3-NEXT: .LBB115_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB115_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB115_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB119_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB115_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB115_2 +; GCN3-NEXT: .LBB115_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -14584,14 +17544,29 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB116_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB116_6 +; GCN1-NEXT: .LBB116_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB116_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB116_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -14603,24 +17578,57 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB120_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB116_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB116_2 +; GCN1-NEXT: .LBB116_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB116_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB116_6 +; GCN2-NEXT: .LBB116_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB116_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB116_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -14632,40 +17640,88 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB120_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB116_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB116_2 +; GCN2-NEXT: .LBB116_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB116_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB116_6 +; GCN3-NEXT: .LBB116_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB116_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB116_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB120_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB116_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB116_2 +; GCN3-NEXT: .LBB116_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -14677,12 +17733,26 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_min_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB117_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB117_6 +; GCN1-NEXT: .LBB117_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB117_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -14693,23 +17763,55 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB121_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB117_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB117_2 +; GCN1-NEXT: .LBB117_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB117_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB117_6 +; GCN2-NEXT: .LBB117_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB117_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -14720,20 +17822,50 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB121_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB117_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB117_2 +; GCN2-NEXT: .LBB117_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB117_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB117_6 +; GCN3-NEXT: .LBB117_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB117_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -14744,14 +17876,31 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB121_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB117_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB117_2 +; GCN3-NEXT: .LBB117_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst ret void } @@ -14759,86 +17908,181 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_min_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB118_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB118_6 +; GCN1-NEXT: .LBB118_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB118_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB118_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB122_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB118_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB118_2 +; GCN1-NEXT: .LBB118_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB118_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB118_6 +; GCN2-NEXT: .LBB118_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB118_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB118_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB122_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB118_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB118_2 +; GCN2-NEXT: .LBB118_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB118_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB118_6 +; GCN3-NEXT: .LBB118_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB118_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB118_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB122_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB118_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB118_2 +; GCN3-NEXT: .LBB118_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst ret void } @@ -14846,12 +18090,21 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_min_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB119_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: flat_load_dword v5, v[5:6] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB119_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -14863,24 +18116,53 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB123_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB119_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: .LBB119_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB119_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[4:5], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB119_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB119_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: flat_load_dword v5, v[5:6] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB119_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -14892,21 +18174,48 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB123_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB119_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: .LBB119_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB119_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[4:5], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB119_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB119_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB119_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -14918,15 +18227,34 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB123_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB119_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: .LBB119_4: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB119_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[4:5], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB119_6: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw min ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -14934,14 +18262,29 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB120_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB120_6 +; GCN1-NEXT: .LBB120_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB120_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB120_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -14953,24 +18296,57 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB124_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB120_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB120_2 +; GCN1-NEXT: .LBB120_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB120_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB120_6 +; GCN2-NEXT: .LBB120_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB120_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB120_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -14982,40 +18358,88 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB124_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB120_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB120_2 +; GCN2-NEXT: .LBB120_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB120_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB120_6 +; GCN3-NEXT: .LBB120_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB120_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB120_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB124_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB120_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB120_2 +; GCN3-NEXT: .LBB120_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw min ptr %gep, i64 %in seq_cst ret i64 %result } @@ -15023,20 +18447,32 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_min_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB121_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB121_6 +; GCN1-NEXT: .LBB121_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -15050,28 +18486,59 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB121_4 +; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB121_2 +; GCN1-NEXT: .LBB121_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB121_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB121_6 +; GCN2-NEXT: .LBB121_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -15085,23 +18552,51 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB121_4 +; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB121_2 +; GCN2-NEXT: .LBB121_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB121_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB121_6 +; GCN3-NEXT: .LBB121_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -15115,11 +18610,27 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB121_4 +; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB121_2 +; GCN3-NEXT: .LBB121_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst ret void } @@ -15127,20 +18638,34 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_min_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB122_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB122_6 +; GCN1-NEXT: .LBB122_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB122_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -15151,31 +18676,64 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB122_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB122_2 +; GCN1-NEXT: .LBB122_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB122_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB122_6 +; GCN2-NEXT: .LBB122_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB122_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 ; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -15186,45 +18744,91 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB122_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB122_2 +; GCN2-NEXT: .LBB122_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB122_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB122_6 +; GCN3-NEXT: .LBB122_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB122_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB122_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB122_2 +; GCN3-NEXT: .LBB122_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst ret void } @@ -15232,20 +18836,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_min_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB123_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -15259,28 +18869,57 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB123_2 +; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB123_6 +; GCN1-NEXT: .LBB123_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB123_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB123_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB123_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -15294,23 +18933,49 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB123_2 +; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB123_6 +; GCN2-NEXT: .LBB123_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB123_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB123_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB123_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -15324,11 +18989,31 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB123_2 +; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB123_6 +; GCN3-NEXT: .LBB123_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB123_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB123_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw min ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -15336,20 +19021,28 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: s_add_u32 s36, s4, 36 -; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB124_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s36 ; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -15361,30 +19054,61 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB124_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB124_6 +; GCN1-NEXT: .LBB124_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB124_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB124_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: s_add_u32 s36, s4, 36 -; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB124_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s36 ; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -15396,25 +19120,53 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB124_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB124_6 +; GCN2-NEXT: .LBB124_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB124_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB124_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB124_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -15422,39 +19174,77 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB124_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB124_6 +; GCN3-NEXT: .LBB124_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB124_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB124_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw min ptr %gep, i64 %in seq_cst ret i64 %result } define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s11 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB125_3 +; GCN1-NEXT: ; %bb.1: ; %Flow6 +; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccnz .LBB125_6 +; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -15465,30 +19255,67 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB129_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB125_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_branch .LBB125_2 +; GCN1-NEXT: .LBB125_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB125_3 +; GCN2-NEXT: ; %bb.1: ; %Flow6 +; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccnz .LBB125_6 +; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -15499,164 +19326,318 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB129_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB125_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_branch .LBB125_2 +; GCN2-NEXT: .LBB125_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_mov_b32 s14, -1 +; GCN3-NEXT: s_mov_b32 s15, 0xe00000 +; GCN3-NEXT: s_add_u32 s12, s12, s11 +; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s6 +; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_add_u32 s0, s0, 32 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_addc_u32 s1, s1, 0 +; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB125_3 +; GCN3-NEXT: ; %bb.1: ; %Flow6 +; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_cbranch_vccnz .LBB125_6 +; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi +; GCN3-NEXT: s_endpgm +; GCN3-NEXT: .LBB125_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v5, s1 ; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB129_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB125_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_branch .LBB125_2 +; GCN3-NEXT: .LBB125_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst ret void } define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB126_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v3 -; GCN1-NEXT: v_mov_b32_e32 v8, v2 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB130_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_cbranch_execnz .LBB126_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_branch .LBB126_6 +; GCN1-NEXT: .LBB126_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB126_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB126_6: ; %atomicrmw.phi +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB126_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v3 -; GCN2-NEXT: v_mov_b32_e32 v8, v2 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB130_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_cbranch_execnz .LBB126_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_branch .LBB126_6 +; GCN2-NEXT: .LBB126_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB126_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB126_6: ; %atomicrmw.phi +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_mov_b32 s18, -1 +; GCN3-NEXT: s_mov_b32 s19, 0xe00000 +; GCN3-NEXT: s_add_u32 s16, s16, s11 +; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN3-NEXT: s_addc_u32 s17, s17, 0 +; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s9 -; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN3-NEXT: s_add_u32 s0, s8, s0 +; GCN3-NEXT: s_addc_u32 s1, s9, s1 +; GCN3-NEXT: s_add_u32 s0, s0, 32 +; GCN3-NEXT: s_addc_u32 s1, s1, 0 +; GCN3-NEXT: s_cmp_eq_u32 s1, s3 +; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN3-NEXT: s_cbranch_vccz .LBB126_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v3 -; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB130_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_cbranch_execnz .LBB126_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_branch .LBB126_6 +; GCN3-NEXT: .LBB126_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB126_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s12 +; GCN3-NEXT: v_mov_b32_e32 v3, s13 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: .LBB126_6: ; %atomicrmw.phi +; GCN3-NEXT: v_mov_b32_e32 v2, s10 +; GCN3-NEXT: v_mov_b32_e32 v3, s11 +; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } @@ -15664,17 +19645,33 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s11 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s4 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB127_3 +; GCN1-NEXT: ; %bb.1: ; %Flow5 +; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccnz .LBB127_6 +; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -15688,23 +19685,58 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB131_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB127_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_branch .LBB127_2 +; GCN1-NEXT: .LBB127_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s4 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB127_3 +; GCN2-NEXT: ; %bb.1: ; %Flow5 +; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccnz .LBB127_6 +; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -15718,26 +19750,60 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB131_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB127_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_branch .LBB127_2 +; GCN2-NEXT: .LBB127_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN3-NEXT: s_mov_b32 s14, -1 +; GCN3-NEXT: s_mov_b32 s15, 0xe00000 +; GCN3-NEXT: s_add_u32 s12, s12, s11 +; GCN3-NEXT: s_addc_u32 s13, s13, 0 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB127_3 +; GCN3-NEXT: ; %bb.1: ; %Flow5 +; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN3-NEXT: s_cbranch_vccnz .LBB127_6 +; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi +; GCN3-NEXT: s_endpgm +; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s3 +; GCN3-NEXT: v_mov_b32_e32 v7, s2 +; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -15745,125 +19811,239 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB131_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB127_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_branch .LBB127_2 +; GCN3-NEXT: .LBB127_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s2 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4 ; GCN3-NEXT: s_endpgm entry: - %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst ret void } define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s11 +; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s6 -; GCN1-NEXT: s_addc_u32 s1, s1, s7 -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN1-NEXT: s_add_u32 s0, s8, s0 +; GCN1-NEXT: s_addc_u32 s1, s9, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB128_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v3 -; GCN1-NEXT: v_mov_b32_e32 v8, v2 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB132_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_cbranch_execnz .LBB128_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_branch .LBB128_6 +; GCN1-NEXT: .LBB128_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB128_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s12 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s13 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB128_6: ; %atomicrmw.phi +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s11 +; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s6 -; GCN2-NEXT: s_addc_u32 s1, s1, s7 -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN2-NEXT: s_add_u32 s0, s8, s0 +; GCN2-NEXT: s_addc_u32 s1, s9, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB128_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v3 -; GCN2-NEXT: v_mov_b32_e32 v8, v2 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB132_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_cbranch_execnz .LBB128_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_branch .LBB128_6 +; GCN2-NEXT: .LBB128_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB128_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s12 +; GCN2-NEXT: v_mov_b32_e32 v4, s13 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB128_6: ; %atomicrmw.phi +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN3-NEXT: s_mov_b32 s18, -1 +; GCN3-NEXT: s_mov_b32 s19, 0xe00000 +; GCN3-NEXT: s_add_u32 s16, s16, s11 +; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GCN3-NEXT: s_addc_u32 s17, s17, 0 +; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s9 -; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GCN3-NEXT: s_add_u32 s0, s8, s0 +; GCN3-NEXT: s_addc_u32 s1, s9, s1 +; GCN3-NEXT: s_cmp_eq_u32 s1, s3 +; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN3-NEXT: s_cbranch_vccz .LBB128_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s13 +; GCN3-NEXT: v_mov_b32_e32 v5, s12 +; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v3 -; GCN3-NEXT: v_mov_b32_e32 v8, v2 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB132_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_cbranch_execnz .LBB128_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_branch .LBB128_6 +; GCN3-NEXT: .LBB128_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB128_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN3-NEXT: s_cselect_b32 s0, s0, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s12 +; GCN3-NEXT: v_mov_b32_e32 v3, s13 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4 +; GCN3-NEXT: .LBB128_6: ; %atomicrmw.phi +; GCN3-NEXT: v_mov_b32_e32 v2, s10 +; GCN3-NEXT: v_mov_b32_e32 v3, s11 +; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } @@ -15872,86 +20052,181 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB133_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB129_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB129_6 +; GCN1-NEXT: .LBB129_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB129_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB129_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB133_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB129_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB129_2 +; GCN1-NEXT: .LBB129_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB129_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB129_6 +; GCN2-NEXT: .LBB129_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB129_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB129_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB133_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB129_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB129_2 +; GCN2-NEXT: .LBB129_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB129_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB129_6 +; GCN3-NEXT: .LBB129_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB129_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB129_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB133_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB129_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB129_2 +; GCN3-NEXT: .LBB129_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -15959,14 +20234,29 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB130_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB130_6 +; GCN1-NEXT: .LBB130_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB130_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB130_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -15978,24 +20268,57 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB134_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB130_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB130_2 +; GCN1-NEXT: .LBB130_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB130_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB130_6 +; GCN2-NEXT: .LBB130_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB130_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB130_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -16007,40 +20330,88 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB134_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB130_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB130_2 +; GCN2-NEXT: .LBB130_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB130_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB130_6 +; GCN3-NEXT: .LBB130_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB130_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB130_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB134_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB130_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB130_2 +; GCN3-NEXT: .LBB130_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -16052,27 +20423,129 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB131_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB131_4 +; GCN1-NEXT: .LBB131_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB131_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB131_2 +; GCN1-NEXT: .LBB131_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB131_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB131_4 +; GCN2-NEXT: .LBB131_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB131_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB131_2 +; GCN2-NEXT: .LBB131_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB131_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB131_4 +; GCN3-NEXT: .LBB131_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB131_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB131_2 +; GCN3-NEXT: .LBB131_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret void } @@ -16080,32 +20553,136 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB132_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB132_4 +; GCN1-NEXT: .LBB132_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB132_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB132_2 +; GCN1-NEXT: .LBB132_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB132_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB132_4 +; GCN2-NEXT: .LBB132_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB132_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB132_2 +; GCN2-NEXT: .LBB132_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB132_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB132_4 +; GCN3-NEXT: .LBB132_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB132_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB132_2 +; GCN3-NEXT: .LBB132_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst ret void } @@ -16113,27 +20690,138 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB133_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB133_4 +; GCN1-NEXT: .LBB133_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB133_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB133_2 +; GCN1-NEXT: .LBB133_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc +; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB133_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB133_4 +; GCN2-NEXT: .LBB133_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB133_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB133_2 +; GCN2-NEXT: .LBB133_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc +; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB133_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB133_4 +; GCN3-NEXT: .LBB133_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB133_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB133_2 +; GCN3-NEXT: .LBB133_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -16141,32 +20829,139 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB134_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB134_4 +; GCN1-NEXT: .LBB134_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB134_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB134_2 +; GCN1-NEXT: .LBB134_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc +; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB134_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB134_4 +; GCN2-NEXT: .LBB134_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB134_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB134_2 +; GCN2-NEXT: .LBB134_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc +; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB134_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB134_4 +; GCN3-NEXT: .LBB134_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB134_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB134_2 +; GCN3-NEXT: .LBB134_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst ret i64 %result } @@ -16174,39 +20969,135 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB135_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB135_4 +; GCN1-NEXT: .LBB135_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB135_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB135_2 +; GCN1-NEXT: .LBB135_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB135_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB135_4 +; GCN2-NEXT: .LBB135_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB135_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB135_2 +; GCN2-NEXT: .LBB135_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB135_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB135_4 +; GCN3-NEXT: .LBB135_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB135_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB135_2 +; GCN3-NEXT: .LBB135_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret void } @@ -16214,44 +21105,142 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB136_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB136_4 +; GCN1-NEXT: .LBB136_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB136_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB136_2 +; GCN1-NEXT: .LBB136_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB136_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB136_4 +; GCN2-NEXT: .LBB136_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB136_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB136_2 +; GCN2-NEXT: .LBB136_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB136_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB136_4 +; GCN3-NEXT: .LBB136_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB136_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB136_2 +; GCN3-NEXT: .LBB136_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst ret void } @@ -16259,39 +21248,129 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB137_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB137_3 +; GCN1-NEXT: s_branch .LBB137_4 +; GCN1-NEXT: .LBB137_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB137_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB137_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB137_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB137_3 +; GCN2-NEXT: s_branch .LBB137_4 +; GCN2-NEXT: .LBB137_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB137_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB137_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB137_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB137_3 +; GCN3-NEXT: s_branch .LBB137_4 +; GCN3-NEXT: .LBB137_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB137_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB137_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -16299,44 +21378,136 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB138_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB138_3 +; GCN1-NEXT: s_branch .LBB138_4 +; GCN1-NEXT: .LBB138_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB138_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB138_3 +; GCN2-NEXT: s_branch .LBB138_4 +; GCN2-NEXT: .LBB138_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB138_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB138_3 +; GCN3-NEXT: s_branch .LBB138_4 +; GCN3-NEXT: .LBB138_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst ret i64 %result } @@ -16344,32 +21515,136 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB139_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB139_4 +; GCN1-NEXT: .LBB139_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB139_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB139_2 +; GCN1-NEXT: .LBB139_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB139_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB139_4 +; GCN2-NEXT: .LBB139_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB139_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB139_2 +; GCN2-NEXT: .LBB139_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB139_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB139_4 +; GCN3-NEXT: .LBB139_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB139_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB139_2 +; GCN3-NEXT: .LBB139_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -16377,32 +21652,139 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB140_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB140_4 +; GCN1-NEXT: .LBB140_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB140_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB140_2 +; GCN1-NEXT: .LBB140_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc +; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB140_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB140_4 +; GCN2-NEXT: .LBB140_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB140_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB140_2 +; GCN2-NEXT: .LBB140_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc +; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB140_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB140_4 +; GCN3-NEXT: .LBB140_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB140_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB140_2 +; GCN3-NEXT: .LBB140_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -16414,27 +21796,132 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB141_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB141_4 +; GCN1-NEXT: .LBB141_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB141_2 +; GCN1-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB141_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB141_4 +; GCN2-NEXT: .LBB141_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB141_2 +; GCN2-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB141_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB141_4 +; GCN3-NEXT: .LBB141_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB141_2 +; GCN3-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret void } @@ -16442,32 +21929,139 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB142_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB142_4 +; GCN1-NEXT: .LBB142_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB142_2 +; GCN1-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB142_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB142_4 +; GCN2-NEXT: .LBB142_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB142_2 +; GCN2-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB142_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB142_4 +; GCN3-NEXT: .LBB142_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB142_2 +; GCN3-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst ret void } @@ -16475,27 +22069,144 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB143_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB143_4 +; GCN1-NEXT: .LBB143_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB143_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB143_2 +; GCN1-NEXT: .LBB143_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB143_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB143_4 +; GCN2-NEXT: .LBB143_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB143_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB143_2 +; GCN2-NEXT: .LBB143_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB143_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB143_4 +; GCN3-NEXT: .LBB143_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB143_2 +; GCN3-NEXT: .LBB143_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -16503,32 +22214,145 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB144_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB144_4 +; GCN1-NEXT: .LBB144_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB144_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB144_2 +; GCN1-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB144_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB144_4 +; GCN2-NEXT: .LBB144_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB144_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB144_2 +; GCN2-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB144_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB144_4 +; GCN3-NEXT: .LBB144_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB144_2 +; GCN3-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst ret i64 %result } @@ -16536,39 +22360,144 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB145_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB145_4 +; GCN1-NEXT: .LBB145_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB145_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB145_2 +; GCN1-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB145_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB145_4 +; GCN2-NEXT: .LBB145_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB145_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB145_2 +; GCN2-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB145_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB145_4 +; GCN3-NEXT: .LBB145_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB145_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB145_2 +; GCN3-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret void } @@ -16576,44 +22505,151 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB146_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB146_4 +; GCN1-NEXT: .LBB146_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB146_2 +; GCN1-NEXT: .LBB146_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB146_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB146_4 +; GCN2-NEXT: .LBB146_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB146_2 +; GCN2-NEXT: .LBB146_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB146_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB146_4 +; GCN3-NEXT: .LBB146_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB146_2 +; GCN3-NEXT: .LBB146_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst ret void } @@ -16621,39 +22657,141 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB147_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB147_3 +; GCN1-NEXT: s_branch .LBB147_4 +; GCN1-NEXT: .LBB147_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB147_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB147_3 +; GCN2-NEXT: s_branch .LBB147_4 +; GCN2-NEXT: .LBB147_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB147_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB147_3 +; GCN3-NEXT: s_branch .LBB147_4 +; GCN3-NEXT: .LBB147_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -16661,44 +22799,148 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB148_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB148_3 +; GCN1-NEXT: s_branch .LBB148_4 +; GCN1-NEXT: .LBB148_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB148_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB148_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB148_3 +; GCN2-NEXT: s_branch .LBB148_4 +; GCN2-NEXT: .LBB148_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB148_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB148_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB148_3 +; GCN3-NEXT: s_branch .LBB148_4 +; GCN3-NEXT: .LBB148_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB148_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] +; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst ret i64 %result } @@ -16706,32 +22948,139 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB149_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB149_4 +; GCN1-NEXT: .LBB149_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB149_2 +; GCN1-NEXT: .LBB149_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB149_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB149_4 +; GCN2-NEXT: .LBB149_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB149_2 +; GCN2-NEXT: .LBB149_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB149_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB149_4 +; GCN3-NEXT: .LBB149_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB149_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB149_2 +; GCN3-NEXT: .LBB149_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -16739,34 +23088,146 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB150_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB150_4 +; GCN1-NEXT: .LBB150_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB150_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB150_2 +; GCN1-NEXT: .LBB150_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB150_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB150_4 +; GCN2-NEXT: .LBB150_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB150_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB150_2 +; GCN2-NEXT: .LBB150_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB150_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB150_4 +; GCN3-NEXT: .LBB150_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB150_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB150_2 +; GCN3-NEXT: .LBB150_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 ret i64 %result } !0 = !{} -!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index edd5620dc41128..604fc732e7e1cd 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -4258,10 +4258,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 @@ -4292,10 +4292,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_max_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 @@ -4326,22 +4326,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -4365,7 +4365,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s6 @@ -4402,7 +4402,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 @@ -4439,23 +4439,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc @@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: @@ -4482,10 +4482,10 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s1 @@ -4514,10 +4514,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX8-LABEL: atomic_max_i64_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -4546,22 +4546,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -4584,7 +4584,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_max_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s6 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 @@ -4654,23 +4654,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc @@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: @@ -5640,10 +5640,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 @@ -5674,10 +5674,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GFX8-LABEL: atomic_umax_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 @@ -5708,22 +5708,22 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -5747,7 +5747,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s6 @@ -5784,7 +5784,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 @@ -5821,23 +5821,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc @@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: @@ -5864,7 +5864,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_umax_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s6 @@ -5899,7 +5899,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 @@ -5934,23 +5934,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc @@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: @@ -7864,10 +7864,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 @@ -7898,10 +7898,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX8-LABEL: atomic_min_i64_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 @@ -7932,22 +7932,22 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -7971,7 +7971,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s6 @@ -8008,7 +8008,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 @@ -8045,23 +8045,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc @@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: @@ -8088,7 +8088,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_min_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8118,7 +8118,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX8-LABEL: atomic_min_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -8148,20 +8148,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -8169,9 +8169,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8183,7 +8183,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GFX7-LABEL: atomic_min_i64_ret_addr64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s6 @@ -8218,7 +8218,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s6 @@ -8253,23 +8253,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc @@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index c75521267ae7ce..a96d022b66f127 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -21,7 +21,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -41,7 +41,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 @@ -75,7 +75,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -101,7 +101,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -166,7 +166,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 @@ -200,7 +200,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -220,7 +220,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -280,7 +280,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 @@ -325,7 +325,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -345,7 +345,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 @@ -379,7 +379,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -405,7 +405,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -470,7 +470,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 @@ -506,7 +506,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -532,7 +532,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 @@ -579,7 +579,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -605,7 +605,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 @@ -652,7 +652,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -703,7 +703,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 @@ -727,7 +727,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 @@ -774,7 +774,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -800,7 +800,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -825,7 +825,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 @@ -899,81 +899,81 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -988,81 +988,81 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1077,81 +1077,81 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1166,81 +1166,81 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1255,81 +1255,81 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1344,81 +1344,81 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1433,81 +1433,81 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1522,81 +1522,81 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1611,81 +1611,81 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1700,81 +1700,81 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1789,81 +1789,81 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1878,81 +1878,81 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s6, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s6 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s6 -; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s12, s2 +; SI-NOFMA-NEXT: s_mov_b32 s13, s3 +; SI-NOFMA-NEXT: s_mov_b32 s15, s7 +; SI-NOFMA-NEXT: s_mov_b32 s10, s6 +; SI-NOFMA-NEXT: s_mov_b32 s11, s7 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s4, s0 +; SI-NOFMA-NEXT: s_mov_b32 s5, s1 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s6, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s6 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s6 -; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s15, s7 +; SI-FMA-NEXT: s_mov_b32 s10, s6 +; SI-FMA-NEXT: s_mov_b32 s11, s7 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s4, s0 +; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) ; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { @@ -1971,7 +1971,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_interp: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s10, -1 ; SI-NOFMA-NEXT: s_mov_b32 s14, s10 @@ -2003,7 +2003,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; SI-FMA-LABEL: test_f32_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2033,7 +2033,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f32_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f32_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2083,7 +2083,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-LABEL: test_f64_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2113,7 +2113,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f64_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2132,7 +2132,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f64_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2164,7 +2164,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_neg_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2180,7 +2180,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2210,7 +2210,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: fma_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2256,7 +2256,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { ; SI-LABEL: fma_neg_b_c_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0 @@ -2277,7 +2277,7 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: fma_neg_b_c_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll index 39a9a85081af59..a10856e36ea82b 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -159,15 +159,15 @@ define float @fold_fmul_distributive(float %x, float %y) { define amdgpu_kernel void @vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) { ; GFX906-LABEL: vec_mul_scalar_add_fma: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, s8 -; GFX906-NEXT: v_mov_b32_e32 v2, s6 -; GFX906-NEXT: v_fmac_f32_e32 v1, s4, v2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX906-NEXT: v_mov_b32_e32 v2, s2 +; GFX906-NEXT: v_fmac_f32_e32 v1, s0, v2 +; GFX906-NEXT: global_store_dword v0, v1, s[6:7] offset:4 ; GFX906-NEXT: s_endpgm %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1 %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 5caaa2c9550f9a..4b3f0dbbaea984 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,37 +67,37 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -137,7 +137,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -167,7 +167,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -197,37 +197,37 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -266,7 +266,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -300,7 +300,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -334,37 +334,37 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -438,7 +438,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -472,37 +472,37 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmax3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll index 018399983a863d..c9b61e40c79e00 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_uge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -111,7 +111,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -132,7 +132,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -163,7 +163,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -184,7 +184,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 84c3913ec93c56..11b7c51ef2d516 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -263,16 +263,16 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS +; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_maximum_f32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[4:5] +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -285,16 +285,16 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS +; GCN-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_maximum_f16 v1, v1, v2 -; GCN-NEXT: global_store_b16 v0, v1, s[4:5] +; GCN-NEXT: global_store_b16 v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 4 %b = load volatile half, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 58e864b496b374..1cdcf276c526be 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -45,7 +45,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -63,7 +63,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -83,19 +83,19 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -138,7 +138,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -173,7 +173,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -193,19 +193,19 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -249,7 +249,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -304,19 +304,19 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -343,7 +343,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -360,7 +360,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -395,7 +395,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -415,19 +415,19 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -472,7 +472,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -490,7 +490,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -509,7 +509,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -530,20 +530,20 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 2.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -570,7 +570,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -634,7 +634,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -658,39 +658,39 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 ; GFX9-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -709,7 +709,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -759,7 +759,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -777,7 +777,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -796,7 +796,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -817,20 +817,20 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_fmed3_r_i_i_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_r_i_i_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -859,7 +859,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -875,7 +875,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -891,7 +891,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -908,7 +908,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -927,18 +927,18 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -963,7 +963,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -980,7 +980,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -998,7 +998,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1039,34 +1039,34 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc ; GFX9-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc, 4.0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1140,7 +1140,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1163,7 +1163,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1222,38 +1222,38 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1270,7 +1270,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1306,7 +1306,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1330,7 +1330,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1353,7 +1353,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1412,38 +1412,38 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1460,7 +1460,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1520,7 +1520,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1602,38 +1602,38 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1650,7 +1650,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1710,7 +1710,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1734,7 +1734,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1794,39 +1794,39 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1843,7 +1843,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1886,7 +1886,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1910,7 +1910,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1935,7 +1935,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1962,7 +1962,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1996,40 +1996,40 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| ; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| ; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2092,7 +2092,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2119,7 +2119,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2144,7 +2144,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2174,7 +2174,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2208,25 +2208,25 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2267,7 +2267,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2291,7 +2291,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; SI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2313,7 +2313,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2340,7 +2340,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2371,22 +2371,22 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2419,7 +2419,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2443,7 +2443,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2465,7 +2465,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2492,7 +2492,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2523,22 +2523,22 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_nnan_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2571,7 +2571,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2595,7 +2595,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2617,7 +2617,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2644,7 +2644,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2675,22 +2675,22 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_fast_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_fast_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2735,7 +2735,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2759,7 +2759,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2781,7 +2781,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2808,7 +2808,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2839,22 +2839,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2887,7 +2887,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2911,7 +2911,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2933,7 +2933,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2960,7 +2960,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2991,22 +2991,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3039,7 +3039,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3063,7 +3063,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3086,7 +3086,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3113,7 +3113,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3145,38 +3145,38 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3193,7 +3193,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3229,7 +3229,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3253,7 +3253,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3275,7 +3275,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3302,7 +3302,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3333,22 +3333,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3381,7 +3381,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3405,7 +3405,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3427,7 +3427,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3454,7 +3454,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3485,22 +3485,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3533,7 +3533,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3557,7 +3557,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3579,7 +3579,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3606,7 +3606,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3637,22 +3637,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3685,7 +3685,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3709,7 +3709,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3731,7 +3731,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3758,7 +3758,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3789,22 +3789,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3837,7 +3837,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3861,7 +3861,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3883,7 +3883,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3910,7 +3910,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3941,22 +3941,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3989,7 +3989,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4013,7 +4013,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4035,7 +4035,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4062,7 +4062,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4093,22 +4093,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4141,7 +4141,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4165,7 +4165,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4187,7 +4187,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4214,7 +4214,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4245,22 +4245,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4293,7 +4293,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4317,7 +4317,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4339,7 +4339,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4366,7 +4366,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4397,22 +4397,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4445,7 +4445,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4469,7 +4469,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4491,7 +4491,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4518,7 +4518,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4549,22 +4549,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4597,7 +4597,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4621,7 +4621,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4643,7 +4643,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4670,7 +4670,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4701,22 +4701,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4749,7 +4749,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4773,7 +4773,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4795,7 +4795,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4822,7 +4822,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4853,22 +4853,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4901,7 +4901,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4925,7 +4925,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4947,7 +4947,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4974,7 +4974,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5005,22 +5005,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5053,7 +5053,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5077,7 +5077,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5099,7 +5099,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5126,7 +5126,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5157,22 +5157,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5205,7 +5205,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5229,7 +5229,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5251,7 +5251,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5278,7 +5278,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5309,22 +5309,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5360,7 +5360,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5384,7 +5384,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5406,7 +5406,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5433,7 +5433,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5464,22 +5464,22 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5516,7 +5516,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5549,7 +5549,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5581,7 +5581,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5616,7 +5616,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5655,14 +5655,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -5673,12 +5673,12 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5718,7 +5718,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5751,7 +5751,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5783,7 +5783,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5818,7 +5818,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5857,14 +5857,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -5875,12 +5875,12 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5905,7 +5905,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5945,7 +5945,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5978,7 +5978,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6010,7 +6010,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6045,7 +6045,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6084,14 +6084,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -6102,12 +6102,12 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6147,7 +6147,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6176,7 +6176,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6204,7 +6204,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6237,7 +6237,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6274,14 +6274,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 @@ -6290,12 +6290,12 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6316,7 +6316,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6353,7 +6353,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6380,7 +6380,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6405,7 +6405,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6435,7 +6435,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6469,25 +6469,25 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6528,7 +6528,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6555,7 +6555,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6580,7 +6580,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6610,7 +6610,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6644,25 +6644,25 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6703,7 +6703,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6730,7 +6730,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6755,7 +6755,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6785,7 +6785,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6819,25 +6819,25 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6878,7 +6878,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6902,7 +6902,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6925,7 +6925,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6952,7 +6952,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6984,38 +6984,38 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7032,7 +7032,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7068,7 +7068,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7094,7 +7094,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7120,7 +7120,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7150,7 +7150,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7185,44 +7185,44 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_min_f32_e64 v4, -v1, v2 ; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1 ; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 ; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7241,7 +7241,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7280,7 +7280,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7305,7 +7305,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7328,7 +7328,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7356,7 +7356,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7388,23 +7388,23 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_global_nnans_min_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_global_nnans_min_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7435,7 +7435,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7454,7 +7454,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7481,7 +7481,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7500,7 +7500,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7521,19 +7521,19 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 ; GFX9-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7559,7 +7559,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7590,7 +7590,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7637,7 +7637,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7670,7 +7670,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7707,25 +7707,25 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_ushort v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f16_e32 v2, 2.0, v2 ; GFX9-NEXT: v_add_f16_e32 v3, 4.0, v3 ; GFX9-NEXT: v_med3_f16 v1, v1, v2, v3 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7767,7 +7767,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: two_non_inline_constant: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7785,7 +7785,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: two_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7803,7 +7803,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: two_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7822,7 +7822,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: two_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7843,20 +7843,20 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: two_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 ; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v1 ; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: two_non_inline_constant: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7872,7 +7872,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-GISEL-LABEL: two_non_inline_constant: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -7900,7 +7900,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: one_non_inline_constant: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7922,7 +7922,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: one_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7944,7 +7944,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: one_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7966,7 +7966,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: one_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7990,23 +7990,23 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: one_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1 ; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1 ; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2 -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: one_non_inline_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8039,7 +8039,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: two_non_inline_constant_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8065,7 +8065,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: two_non_inline_constant_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -8091,7 +8091,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: two_non_inline_constant_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8117,7 +8117,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: two_non_inline_constant_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41800000 @@ -8145,18 +8145,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x41000000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x41000000 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1 ; GFX9-SDAG-NEXT: v_add_f32_e32 v4, 0x41800000, v1 ; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1 -; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s0, v2 -; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off @@ -8165,18 +8165,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v1 ; GFX9-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v1 ; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 ; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 -; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v5, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off @@ -8185,7 +8185,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8207,7 +8207,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 714040405bf67b..38b712e044df93 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,37 +67,37 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -137,7 +137,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -167,7 +167,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -197,37 +197,37 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_dword v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -266,7 +266,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -300,7 +300,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -334,37 +334,37 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -438,7 +438,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -472,37 +472,37 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; GFX9-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_fmin3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -604,7 +604,7 @@ entry: define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -638,7 +638,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -672,30 +672,28 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc +; GFX9-NEXT: s_mov_b32 s4, s14 +; GFX9-NEXT: s_mov_b32 s5, s15 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] @@ -706,7 +704,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -749,7 +747,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -783,7 +781,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -817,30 +815,28 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s14, s2 -; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s16, s8 -; GFX9-NEXT: s_mov_b32 s17, s9 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_mov_b32 s16, s12 +; GFX9-NEXT: s_mov_b32 s17, s13 ; GFX9-NEXT: s_mov_b32 s18, s2 ; GFX9-NEXT: s_mov_b32 s19, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc +; GFX9-NEXT: s_mov_b32 s4, s14 +; GFX9-NEXT: s_mov_b32 s5, s15 +; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] @@ -851,7 +847,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll index 85653ded63ce6f..42e618a94e9a45 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_uge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -57,7 +57,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ule_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -182,7 +182,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ult_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -213,7 +213,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -234,7 +234,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -265,7 +265,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -286,7 +286,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,7 +317,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ole_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -390,7 +390,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_olt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 0353fc4f2b91b8..3bd82eca8ce955 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -263,16 +263,16 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS +; GCN-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_minimum_f32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[4:5] +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -285,16 +285,16 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS +; GCN-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: v_minimum_f16 v1, v1, v2 -; GCN-NEXT: global_store_b16 v0, v1, s[4:5] +; GCN-NEXT: global_store_b16 v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 4 %b = load volatile half, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 57a960207180c6..64be9cb72a6ee3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -15,7 +15,7 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_fadd_use_test_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_fadd_use_test_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -77,9 +77,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, float %x, [8 x i32], float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmac_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-NEXT: s_load_dword s3, s[6:7], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-NEXT: s_load_dword s3, s[8:9], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s0, 4 @@ -99,9 +99,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX10-LABEL: multiple_use_fadd_fmac_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, s2, s2 @@ -115,13 +115,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-LABEL: multiple_use_fadd_fmac_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, s4, s4 -; GFX11-NEXT: v_fma_f32 v2, s4, 2.0, s5 +; GFX11-NEXT: v_add_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc @@ -138,7 +138,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -157,7 +157,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_use_fadd_fmad_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -170,7 +170,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_use_fadd_fmad_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -192,8 +192,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_use_fadd_multi_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 @@ -214,8 +214,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX10-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_fma_f32 v1, |s0|, 2.0, s1 @@ -229,15 +229,15 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX11-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_fma_f32 v1, |s4|, 2.0, s5 -; GFX11-NEXT: v_fma_f32 v2, |s4|, 2.0, s6 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: v_fma_f32 v1, |s0|, 2.0, s1 +; GFX11-NEXT: v_fma_f32 v2, |s0|, 2.0, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc +; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 @@ -253,8 +253,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -267,8 +267,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn2_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, s2, -4.0 @@ -280,12 +280,12 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, s4, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -300,8 +300,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn3_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 @@ -315,8 +315,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn3_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 @@ -328,12 +328,12 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn3_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s4 +; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -348,8 +348,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_fadd_use_test_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -366,8 +366,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_fadd_use_test_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -384,13 +384,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -402,12 +402,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dword s0, s[8:9], 0x8 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -421,13 +421,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -441,12 +441,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-FLUSH-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -480,8 +480,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -501,8 +501,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -523,8 +523,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -539,8 +539,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 @@ -555,13 +555,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s4, s4 -; GFX11-DENORM-NEXT: v_fma_f16 v2, s4, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s2, s2 +; GFX11-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -571,12 +571,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s4, s4 -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -597,8 +597,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -618,8 +618,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 @@ -640,8 +640,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -656,8 +656,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| @@ -672,13 +672,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s4|, |s4| -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2| +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -688,12 +688,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -715,9 +715,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -738,9 +738,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; ; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -762,9 +762,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16 @@ -779,9 +779,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| @@ -797,14 +797,14 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x2 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-DENORM-NEXT: s_load_b32 s6, s[4:5], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 -; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, s1 +; GFX11-DENORM-NEXT: v_fma_f16 v1, |s6|, 2.0, s0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc @@ -814,12 +814,12 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x2 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s6, s[4:5], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s6|, |s6| ; GFX11-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 @@ -845,8 +845,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -859,8 +859,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, s2, -4.0 @@ -872,13 +872,13 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, s4, -4.0 +; GFX11-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -895,8 +895,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn3_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 @@ -910,8 +910,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn3_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, 0xc600, s2 @@ -923,13 +923,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn3_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s4 +; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index e12c854f03c627..2cc5159c29f7ff 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -7,73 +7,73 @@ define amdgpu_kernel void @fmul_f16( ; SI-LABEL: fmul_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fmul_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s14, s10 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s14, s6 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s12, s6 -; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s11 -; GFX89-NEXT: s_mov_b32 s2, s10 -; GFX89-NEXT: s_mov_b32 s3, s11 +; GFX89-NEXT: s_mov_b32 s12, s2 +; GFX89-NEXT: s_mov_b32 s13, s3 +; GFX89-NEXT: s_mov_b32 s15, s7 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s4 -; GFX89-NEXT: s_mov_b32 s9, s5 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm @@ -91,7 +91,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_a( ; SI-LABEL: fmul_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -109,45 +109,27 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fmul_f16_imm_a: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fmul_f16_imm_a: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_mul_f16_e32 v0, 0x4200, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fmul_f16_imm_a: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -174,7 +156,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_b( ; SI-LABEL: fmul_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -192,45 +174,27 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fmul_f16_imm_b: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fmul_f16_imm_b: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fmul_f16_imm_b: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -257,21 +221,21 @@ entry: define amdgpu_kernel void @fmul_v2f16( ; SI-LABEL: fmul_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -286,73 +250,73 @@ define amdgpu_kernel void @fmul_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -371,7 +335,7 @@ entry: define amdgpu_kernel void @fmul_v2f16_imm_a( ; SI-LABEL: fmul_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -397,7 +361,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; VI-LABEL: fmul_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -418,26 +382,26 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX9-LABEL: fmul_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -464,7 +428,7 @@ entry: define amdgpu_kernel void @fmul_v2f16_imm_b( ; SI-LABEL: fmul_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -490,7 +454,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; VI-LABEL: fmul_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,26 +475,26 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX9-LABEL: fmul_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -557,21 +521,21 @@ entry: define amdgpu_kernel void @fmul_v4f16( ; SI-LABEL: fmul_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -598,26 +562,26 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v1, v3, v1 @@ -625,50 +589,50 @@ define amdgpu_kernel void @fmul_v4f16( ; VI-NEXT: v_mul_f16_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1 ; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0 @@ -688,7 +652,7 @@ entry: define amdgpu_kernel void @fmul_v4f16_imm_a( ; SI-LABEL: fmul_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -723,7 +687,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; VI-LABEL: fmul_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -747,28 +711,28 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX9-LABEL: fmul_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s6, 0x44004200 -; GFX9-NEXT: s_mov_b32 s7, 0x40004800 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s2, 0x44004200 +; GFX9-NEXT: s_mov_b32 s3, 0x40004800 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v1, v1, s6 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s7 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 8298a925343ba3..a753e38b04abf4 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -22,7 +22,7 @@ declare half @llvm.fabs.f16(half) #1 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -62,37 +62,37 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9] -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11] +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[10:11] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[12:13] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[14:15] ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[8:9] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9] -; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11] +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[10:11] +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[12:13] +; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[14:15] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5] +; GFX10-DENORM-NEXT: global_store_short v0, v3, s[8:9] ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -109,7 +109,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -132,7 +132,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -152,7 +152,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -172,53 +172,53 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-FLUSH-LABEL: fmul_fadd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9] -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11] +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[10:11] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[12:13] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[14:15] ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[8:9] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[8:9] -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[10:11] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[10:11] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[12:13] +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[14:15] ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[8:9] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[8:9] -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[10:11] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[10:11] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[12:13] +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[14:15] ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[8:9] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmul_fadd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -235,7 +235,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: s_clause 0x2 @@ -252,7 +252,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -276,7 +276,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_contract_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -296,7 +296,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; VI-DENORM-LABEL: fmul_fadd_contract_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -316,37 +316,37 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9] -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11] +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[10:11] +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[12:13] +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[14:15] ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[8:9] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9] -; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11] +; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[10:11] +; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[12:13] +; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[14:15] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5] +; GFX10-DENORM-NEXT: global_store_short v0, v3, s[8:9] ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -363,7 +363,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -387,7 +387,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -405,7 +405,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -423,7 +423,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -437,7 +437,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -450,7 +450,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -467,7 +467,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -495,7 +495,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -513,7 +513,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -531,7 +531,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -545,7 +545,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -558,7 +558,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -575,7 +575,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -603,7 +603,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_a_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -621,7 +621,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -639,7 +639,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -653,7 +653,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -667,7 +667,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -680,7 +680,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -697,7 +697,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -714,7 +714,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -745,7 +745,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_b_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -763,7 +763,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -781,7 +781,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -795,7 +795,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -809,7 +809,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -822,7 +822,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -839,7 +839,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -856,7 +856,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -887,7 +887,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -905,7 +905,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -923,7 +923,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -937,7 +937,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -950,7 +950,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -967,7 +967,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -995,7 +995,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1075,7 +1075,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1105,7 +1105,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1141,7 +1141,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1155,7 +1155,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1251,7 +1251,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1278,7 +1278,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1295,7 +1295,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1325,7 +1325,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1350,7 +1350,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1375,54 +1375,54 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1441,7 +1441,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1460,7 +1460,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1519,7 +1519,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1544,54 +1544,54 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1610,7 +1610,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1629,7 +1629,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1663,7 +1663,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1713,54 +1713,54 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1833,7 +1833,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1858,7 +1858,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1883,54 +1883,54 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1949,7 +1949,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1968,7 +1968,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2003,7 +2003,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: neg_neg_mad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2028,7 +2028,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2053,54 +2053,54 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2119,7 +2119,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2138,7 +2138,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2174,7 +2174,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_fabs_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2199,7 +2199,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2224,54 +2224,54 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2290,7 +2290,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2309,7 +2309,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2362,7 +2362,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2380,7 +2380,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2394,7 +2394,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2421,7 +2421,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2438,7 +2438,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2455,7 +2455,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2485,7 +2485,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2503,7 +2503,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2521,7 +2521,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2535,7 +2535,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2549,7 +2549,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2562,7 +2562,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2579,7 +2579,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -2596,7 +2596,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 435d5bd79d095c..fe5601594dca8d 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -15,12 +15,12 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; SI-LABEL: fnearbyint_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: v_rndne_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -28,12 +28,12 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; ; CI-LABEL: fnearbyint_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s0, s[4:5], 0xb ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_rndne_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -42,10 +42,10 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; ; VI-LABEL: fnearbyint_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f16_e32 v2, s4 +; VI-NEXT: v_rndne_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -54,11 +54,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; GFX11-LABEL: fnearbyint_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f16_e32 v1, s4 +; GFX11-NEXT: v_rndne_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %1 = call half @llvm.nearbyint.f16(half %in) @@ -69,21 +69,21 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; SICI-LABEL: fnearbyint_f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dword s4, s[2:3], 0xb -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SICI-NEXT: s_load_dword s6, s[4:5], 0xb +; SICI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) -; SICI-NEXT: v_rndne_f32_e32 v0, s4 +; SICI-NEXT: v_rndne_f32_e32 v0, s6 ; SICI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SICI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f32_e32 v2, s4 +; VI-NEXT: v_rndne_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -92,11 +92,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; GFX11-LABEL: fnearbyint_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s4 +; GFX11-NEXT: v_rndne_f32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -108,7 +108,7 @@ entry: define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v2f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SICI-NEXT: s_mov_b32 s7, 0xf000 ; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -121,7 +121,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 @@ -132,7 +132,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; GFX11-LABEL: fnearbyint_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 @@ -148,44 +148,44 @@ entry: define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v4f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SICI-NEXT: s_mov_b32 s3, 0xf000 -; SICI-NEXT: s_mov_b32 s2, -1 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SICI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SICI-NEXT: s_mov_b32 s7, 0xf000 +; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) -; SICI-NEXT: v_rndne_f32_e32 v3, s7 -; SICI-NEXT: v_rndne_f32_e32 v2, s6 -; SICI-NEXT: v_rndne_f32_e32 v1, s5 -; SICI-NEXT: v_rndne_f32_e32 v0, s4 -; SICI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SICI-NEXT: v_rndne_f32_e32 v3, s3 +; SICI-NEXT: v_rndne_f32_e32 v2, s2 +; SICI-NEXT: v_rndne_f32_e32 v1, s1 +; SICI-NEXT: v_rndne_f32_e32 v0, s0 +; SICI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SICI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_rndne_f32_e32 v2, s6 -; VI-NEXT: v_rndne_f32_e32 v1, s5 -; VI-NEXT: v_rndne_f32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_rndne_f32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_rndne_f32_e32 v2, s2 +; VI-NEXT: v_rndne_f32_e32 v1, s1 +; VI-NEXT: v_rndne_f32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fnearbyint_v4f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v3, s7 -; GFX11-NEXT: v_rndne_f32_e32 v2, s6 -; GFX11-NEXT: v_rndne_f32_e32 v1, s5 -; GFX11-NEXT: v_rndne_f32_e32 v0, s4 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: v_rndne_f32_e32 v3, s3 +; GFX11-NEXT: v_rndne_f32_e32 v2, s2 +; GFX11-NEXT: v_rndne_f32_e32 v1, s1 +; GFX11-NEXT: v_rndne_f32_e32 v0, s0 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) @@ -196,7 +196,7 @@ entry: define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-LABEL: nearbyint_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s8, -2 @@ -220,7 +220,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; CI-LABEL: nearbyint_f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -230,7 +230,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: nearbyint_f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -240,7 +240,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; GFX11-LABEL: nearbyint_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] @@ -254,8 +254,8 @@ entry: define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: nearbyint_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_brev_b32 s10, -2 @@ -287,23 +287,25 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; CI-LABEL: nearbyint_v2f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_rndne_f64_e32 v[2:3], s[2:3] +; CI-NEXT: v_rndne_f64_e32 v[0:1], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; CI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: nearbyint_v2f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_rndne_f64_e32 v[2:3], s[2:3] +; VI-NEXT: v_rndne_f64_e32 v[0:1], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -311,13 +313,13 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; GFX11-LABEL: nearbyint_v2f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[2:3] +; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[0:1] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -329,8 +331,8 @@ entry: define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: nearbyint_v4f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_brev_b32 s14, -2 @@ -379,28 +381,28 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; CI-LABEL: nearbyint_v4f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] -; CI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] -; CI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; CI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; CI-NEXT: v_rndne_f64_e32 v[6:7], s[14:15] +; CI-NEXT: v_rndne_f64_e32 v[4:5], s[12:13] +; CI-NEXT: v_rndne_f64_e32 v[2:3], s[10:11] +; CI-NEXT: v_rndne_f64_e32 v[0:1], s[8:9] ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: nearbyint_v4f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] -; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] -; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; VI-NEXT: v_rndne_f64_e32 v[6:7], s[14:15] +; VI-NEXT: v_rndne_f64_e32 v[4:5], s[12:13] +; VI-NEXT: v_rndne_f64_e32 v[2:3], s[10:11] +; VI-NEXT: v_rndne_f64_e32 v[0:1], s[8:9] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s3 @@ -414,14 +416,14 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; GFX11-LABEL: nearbyint_v4f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] -; GFX11-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] -; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] -; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] +; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[14:15] +; GFX11-NEXT: v_rndne_f64_e32 v[4:5], s[12:13] +; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[10:11] +; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[8:9] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 74e2b9ea714258..9a72fe96b5c3af 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3016,12 +3016,12 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitcmp1_b32 s4, 0 +; SI-NEXT: s_bitcmp1_b32 s6, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec @@ -3036,12 +3036,12 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 ; ; VI-LABEL: s_fneg_select_infloop_regression_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 0 +; VI-NEXT: s_bitcmp1_b32 s6, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_bitcmp1_b32 s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_bitcmp1_b32 s2, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; ; VI-LABEL: s_fneg_select_infloop_regression_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_bitcmp1_b32 s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, 1, s1 ; VI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3216,39 +3216,39 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitcmp1_b32 s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: s_bitcmp1_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3] -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3] -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s6, 0 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_bitcmp1_b32 s2, 0 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fabs_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fabs_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index b821f9968490e3..d6f6d440f9a835 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -7,12 +7,12 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -23,8 +23,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -36,8 +36,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fadd_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -49,13 +49,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_sub_f16_e64 v1, s2, |s4| +; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) @@ -68,13 +68,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -85,8 +85,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fmul_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -98,8 +98,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fmul_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -111,13 +111,13 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fmul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, s2, -|s4| +; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) @@ -133,8 +133,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: fneg_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -145,8 +145,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: fneg_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -157,8 +157,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: fneg_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -169,10 +169,10 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: fneg_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -187,8 +187,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: fneg_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -199,8 +199,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fneg_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -211,8 +211,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -223,10 +223,10 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fneg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -240,7 +240,7 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -265,7 +265,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -283,12 +283,12 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -304,8 +304,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -321,8 +321,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -334,11 +334,11 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s4 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -355,8 +355,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -367,8 +367,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_fneg_fabs_v2f16_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -379,8 +379,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -391,10 +391,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -408,7 +408,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -421,7 +421,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX9-LABEL: fneg_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -433,7 +433,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX11-LABEL: fneg_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -451,12 +451,12 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -471,8 +471,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fold_user_fneg_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -487,8 +487,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: fold_user_fneg_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -499,11 +499,11 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: fold_user_fneg_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -518,8 +518,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -535,8 +535,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; VI-LABEL: s_fneg_multi_use_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -552,8 +552,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff @@ -567,10 +567,10 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: s_xor_b32 s5, s4, 0x80008000 @@ -589,8 +589,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -613,8 +613,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -634,8 +634,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff @@ -648,10 +648,10 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: v_pk_mul_f16 v2, s4, -4.0 op_sel_hi:[1,0] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index d0115523b18823..52b6d2cbaa6ebe 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -5,29 +5,29 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -|v[0:1]| -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|v[0:1]| -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_f64 v[0:1], s[4:5], -|v[0:1]| +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %x) @@ -40,7 +40,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %xptr, ptr addrspace(1) %yptr) { ; SI-LABEL: v_fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -52,7 +52,7 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -73,29 +73,29 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fmul_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_mul_f64 v[0:1], s[8:9], -|v[0:1]| -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabs_fmul_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mul_f64 v[0:1], s[0:1], -|v[0:1]| -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mul_f64 v[0:1], s[4:5], -|v[0:1]| +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %x) @@ -108,7 +108,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: fneg_fabs_free_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -122,7 +122,7 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: fneg_fabs_free_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_or_b32 s0, s3, 0x80000000 @@ -141,7 +141,7 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: fneg_fabs_fn_free_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -155,7 +155,7 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: fneg_fabs_fn_free_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_or_b32 s0, s3, 0x80000000 @@ -174,21 +174,21 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %in) { ; SI-LABEL: fneg_fabs_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s5, 31 +; SI-NEXT: s_or_b32 s4, s7, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabs_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -206,33 +206,33 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fneg_fabs_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s7, 31 -; SI-NEXT: s_bitset1_b32 s5, 31 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bitset1_b32 s3, 31 +; SI-NEXT: s_bitset1_b32 s1, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabs_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s7, 0x80000000 -; VI-NEXT: s_or_b32 s3, s5, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v3, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: s_bitset1_b32 s1, 31 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) @@ -244,50 +244,50 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fneg_fabs_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s7, 31 -; SI-NEXT: s_bitset1_b32 s11, 31 -; SI-NEXT: s_bitset1_b32 s9, 31 -; SI-NEXT: s_bitset1_b32 s5, 31 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: s_or_b32 s4, s11, 0x80000000 +; SI-NEXT: s_or_b32 s5, s15, 0x80000000 +; SI-NEXT: s_or_b32 s6, s13, 0x80000000 +; SI-NEXT: s_or_b32 s7, s9, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabs_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s7, 31 -; VI-NEXT: s_bitset1_b32 s5, 31 -; VI-NEXT: s_or_b32 s2, s11, 0x80000000 -; VI-NEXT: s_or_b32 s3, s9, 0x80000000 +; VI-NEXT: s_or_b32 s4, s11, 0x80000000 +; VI-NEXT: s_or_b32 s5, s9, 0x80000000 +; VI-NEXT: s_or_b32 s2, s15, 0x80000000 +; VI-NEXT: s_or_b32 s3, s13, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 6446145bbfe2ad..17e509acfb6e63 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fadd_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fadd_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fmul_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fmul_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| @@ -67,11 +67,11 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_fabsf_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -79,10 +79,10 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_fabsf_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -98,11 +98,11 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_fabsf_fn_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -110,10 +110,10 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: fneg_fabsf_fn_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -129,11 +129,11 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,10 +141,10 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fneg_fabsf_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 @@ -232,37 +232,37 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fneg_fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s7, 31 -; SI-NEXT: s_bitset1_b32 s6, 31 -; SI-NEXT: s_bitset1_b32 s5, 31 -; SI-NEXT: s_bitset1_b32 s4, 31 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bitset1_b32 s3, 31 +; SI-NEXT: s_bitset1_b32 s2, 31 +; SI-NEXT: s_bitset1_b32 s1, 31 +; SI-NEXT: s_bitset1_b32 s0, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s7, 0x80000000 -; VI-NEXT: s_or_b32 s3, s6, 0x80000000 -; VI-NEXT: s_bitset1_b32 s5, 31 -; VI-NEXT: s_bitset1_b32 s4, 31 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: s_bitset1_b32 s2, 31 +; VI-NEXT: s_bitset1_b32 s1, 31 +; VI-NEXT: s_bitset1_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 4412e04e121a96..b2d30b751ae2c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1475,11 +1475,11 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) { ; GFX7-LABEL: multiple_uses_fneg_select_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s8, s[6:7], 0x4 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x6 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x4 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bitcmp1_b32 s8, 0 +; GFX7-NEXT: s_bitcmp1_b32 s6, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX7-NEXT: v_mov_b32_e32 v0, s3 @@ -1497,12 +1497,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; ; GFX9-LABEL: multiple_uses_fneg_select_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s8, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 +; GFX9-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bitcmp1_b32 s8, 0 +; GFX9-NEXT: s_bitcmp1_b32 s6, 0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1519,22 +1519,22 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX11-LABEL: multiple_uses_fneg_select_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x18 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x18 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s5 -; GFX11-NEXT: s_bitcmp1_b32 s8, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: s_bitcmp1_b32 s6, 0 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo -; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s5, s7 -; GFX11-NEXT: s_cselect_b32 s3, s4, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v1, s2, -v0, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo +; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s1, s1, s3 +; GFX11-NEXT: s_cselect_b32 s0, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, -v0, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %a = select i1 %z, double %x, double %y %b = fneg double %a @@ -1547,7 +1547,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; GCN-LABEL: fnge_select_f32_multi_use_regression: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1560,7 +1560,7 @@ define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; ; GFX11-LABEL: fnge_select_f32_multi_use_regression: ; GFX11: ; %bb.0: ; %.entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index b9dd2727b36784..16150da4063e69 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI-LABEL: s_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -20,8 +20,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX8-LABEL: s_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -32,8 +32,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX9-LABEL: s_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -44,10 +44,10 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX11-LABEL: s_fneg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -62,7 +62,7 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -76,7 +76,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX8-LABEL: v_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -90,7 +90,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: v_fneg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -123,8 +123,8 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI-LABEL: s_fneg_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -135,8 +135,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX8-LABEL: s_fneg_free_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -147,8 +147,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX9-LABEL: s_fneg_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -159,10 +159,10 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX11-LABEL: s_fneg_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -176,7 +176,7 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -193,7 +193,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_fneg_fold_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -207,7 +207,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -218,7 +218,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fold_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -236,8 +236,8 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: s_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX8-LABEL: s_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -260,8 +260,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX9-LABEL: s_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -272,10 +272,10 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX11-LABEL: s_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -288,7 +288,7 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND @@ -302,7 +302,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: s_fneg_v2f16_nonload: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s2 ; GFX9-NEXT: ;;#ASMEND @@ -315,7 +315,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: s_fneg_v2f16_nonload: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s2 ; GFX11-NEXT: ;;#ASMEND @@ -335,7 +335,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -349,7 +349,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -374,7 +374,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fneg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -396,8 +396,8 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fneg_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -408,8 +408,8 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX8-LABEL: fneg_free_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -420,8 +420,8 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fneg_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -432,10 +432,10 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fneg_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -449,7 +449,7 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -475,7 +475,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -491,7 +491,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -502,7 +502,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -541,7 +541,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -558,7 +558,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -574,7 +574,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -603,7 +603,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -619,7 +619,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -633,7 +633,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index d8809132883a9e..87f1303ab8f5d9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -7,22 +7,22 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: s_xor_b32 s4, s6, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,10 +32,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: s_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -48,7 +48,7 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) { ; SI-LABEL: s_fneg_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -63,7 +63,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; VI-LABEL: s_fneg_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 @@ -76,7 +76,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; GFX11-LABEL: s_fneg_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 @@ -93,55 +93,55 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) { ; SI-LABEL: s_fneg_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s7, s7, 0x80000000 -; SI-NEXT: s_xor_b32 s6, s6, 0x80000000 -; SI-NEXT: s_xor_b32 s5, s5, 0x80000000 -; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_xor_b32 s3, s3, 0x80000000 +; SI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; SI-NEXT: s_xor_b32 s1, s1, 0x80000000 +; SI-NEXT: s_xor_b32 s0, s0, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s7, 0x80000000 -; VI-NEXT: s_xor_b32 s3, s6, 0x80000000 -; VI-NEXT: s_xor_b32 s5, s5, 0x80000000 -; VI-NEXT: s_xor_b32 s4, s4, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 +; VI-NEXT: s_xor_b32 s1, s1, 0x80000000 +; VI-NEXT: s_xor_b32 s0, s0, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000 -; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000 -; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000 -; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000 +; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_xor_b32 s0, s0, 0x80000000 +; GFX11-NEXT: s_xor_b32 s1, s1, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm %fneg = fsub <4 x float> , %in store <4 x float> %fneg, ptr addrspace(1) %out @@ -151,21 +151,21 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fsub0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_sub_f32_e64 v0, 0, s4 +; SI-NEXT: v_sub_f32_e64 v0, 0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fsub0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -174,11 +174,11 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fsub0_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to float @@ -190,22 +190,22 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: s_xor_b32 s4, s6, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -215,10 +215,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fneg_free_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -232,21 +232,21 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fold_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, -s4, s4 +; SI-NEXT: v_mul_f32_e64 v0, -s6, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fneg_fold_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, -s4, s4 +; VI-NEXT: v_mul_f32_e64 v2, -s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -255,11 +255,11 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: fneg_fold_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4 +; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %fsub = fsub float -0.0, %in @@ -272,21 +272,21 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, s4, -4.0 +; SI-NEXT: v_mul_f32_e64 v0, s6, -4.0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bitpreserve_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0 +; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -295,11 +295,11 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; GFX11-LABEL: bitpreserve_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %in.bc = bitcast float %in to i32 @@ -313,22 +313,22 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s4, s4, 0x80000000 +; SI-NEXT: s_xor_b32 s4, s6, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -338,10 +338,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -364,21 +364,21 @@ define i32 @v_fneg_i32(i32 %in) { define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_sub_f32_e64 v0, 2.0, s4 +; SI-NEXT: v_sub_f32_e64 v0, 2.0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_i32_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -387,11 +387,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %fneg = xor i32 %in, -2147483648 @@ -416,7 +416,7 @@ define float @v_fneg_i32_fp_use(i32 %in) { define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -430,7 +430,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 @@ -442,7 +442,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -468,7 +468,7 @@ define i64 @v_fneg_i64(i64 %in) { define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -480,7 +480,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -490,7 +490,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64_fp_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 @@ -528,12 +528,12 @@ define i16 @v_fneg_i16(i16 %in) { define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; SI-LABEL: s_fneg_i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -542,10 +542,10 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: s_fneg_i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -554,11 +554,11 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fneg_i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %fneg = xor i16 %in, -32768 @@ -596,27 +596,27 @@ define half @v_fneg_i16_fp_use(i16 %in) { define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s4, s4, 0x80008000 +; SI-NEXT: s_xor_b32 s4, s6, 0x80008000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -626,10 +626,10 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; GFX11-LABEL: s_fneg_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -670,18 +670,18 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -690,15 +690,15 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; ; VI-LABEL: s_fneg_v2i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_add_f16_e64 v1, s3, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -709,11 +709,11 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; GFX11-LABEL: s_fneg_v2i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %in = bitcast i32 %arg to <2 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 58a6c2ab4bf030..4216bdf409edaf 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -8,7 +8,7 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -36,7 +36,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -48,7 +48,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -92,7 +92,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 @@ -140,7 +140,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -150,7 +150,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 038e7b4f5e2bb8..b1c8107c3d1dd4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,8 +9,8 @@ declare double @llvm.fabs.f64(double) #1 define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isinf_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x204 @@ -22,11 +22,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; ; VI-LABEL: test_isinf_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -36,11 +36,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; GFX11-LABEL: test_isinf_pattern: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -55,8 +55,8 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -68,11 +68,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; ; VI-LABEL: test_not_isinf_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s4|, v0 +; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -82,11 +82,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; GFX11-LABEL: test_not_isinf_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; VI-LABEL: test_not_isinf_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; GFX11-LABEL: test_not_isinf_pattern_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -136,8 +136,8 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 @@ -149,11 +149,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; ; VI-LABEL: test_isfinite_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -163,11 +163,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -184,8 +184,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 @@ -197,11 +197,11 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; ; VI-LABEL: test_isfinite_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -211,11 +211,11 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -231,22 +231,22 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s4, s4 +; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -256,11 +256,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -278,8 +278,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -293,12 +293,12 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s6, s6 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 ; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] @@ -309,14 +309,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -332,7 +332,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocapture %out, float %x, float %y) #0 { ; SI-LABEL: test_isfinite_not_pattern_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -348,7 +348,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 @@ -385,8 +385,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -400,12 +400,12 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4 -; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s4|, v0 +; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s6, s6 +; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s6|, v0 ; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] @@ -416,14 +416,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -439,8 +439,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 @@ -452,11 +452,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; ; VI-LABEL: test_isfinite_pattern_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -466,11 +466,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -487,8 +487,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_commute_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 @@ -500,11 +500,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; ; VI-LABEL: test_isfinite_pattern_4_commute_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -514,11 +514,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; GFX11-LABEL: test_isfinite_pattern_4_commute_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -535,11 +535,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: s_load_dword s1, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s0, s[4:5], 0x14 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dword s1, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 @@ -547,14 +547,14 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x50 -; VI-NEXT: s_load_dword s1, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x50 +; VI-NEXT: s_load_dword s1, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 @@ -570,15 +570,15 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x50 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x50 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s3, s4, 0x1f8 -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s5 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -594,12 +594,12 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: s_and_b32 s4, s6, 0x7fff ; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -608,11 +608,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; ; VI-LABEL: test_isinf_pattern_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -622,11 +622,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; GFX11-LABEL: test_isinf_pattern_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -641,13 +641,13 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_and_b32 s4, s6, 0x7fff ; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -658,11 +658,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -672,11 +672,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -693,13 +693,13 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_and_b32 s4, s6, 0x7fff ; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -710,11 +710,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_4_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -724,11 +724,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_4_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 626a22653f7def..15b5c136eb410a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -20,115 +20,115 @@ declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm ; ; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen ; G_GFX1100-NEXT: s_endpgm main_body: @@ -230,14 +230,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; SI-NEXT: s_load_dword s0, s[4:5], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -246,7 +246,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -259,19 +259,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s10 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v1, s14 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -283,7 +283,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc @@ -295,27 +295,27 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s10 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ds_store_b32 v1, v0 ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 -; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 +; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_SI-NEXT: s_load_dword s0, s[4:5], 0xf ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) @@ -324,14 +324,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_GFX7-NEXT: s_load_dword s0, s[4:5], 0xf ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) @@ -341,13 +341,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c -; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; G_GFX10-NEXT: s_load_dword s0, s[4:5], 0x3c ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) @@ -357,13 +358,13 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c -; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_GFX1030-NEXT: s_load_dword s0, s[4:5], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -373,12 +374,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c -; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc +; G_GFX1100-NEXT: s_load_b32 s0, s[4:5], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -393,115 +394,115 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 ; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen ; G_GFX1100-NEXT: s_endpgm main_body: @@ -603,7 +604,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -618,7 +619,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -633,19 +634,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10-NEXT: global_store_dword v1, v0, s[14:15] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -657,7 +658,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -668,7 +669,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -680,7 +681,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -694,7 +695,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -708,19 +709,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s12 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s13 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) -; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11] +; G_GFX10-NEXT: global_store_dword v1, v0, s[14:15] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -732,7 +733,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index ba3735c1186105..bc2096093ab91b 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -18,105 +18,105 @@ declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen ; G_GFX1100-NEXT: s_endpgm main_body: @@ -211,14 +211,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; SI-NEXT: s_load_dword s0, s[4:5], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -227,7 +227,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -240,19 +240,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s10 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v1, s14 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -264,7 +264,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc @@ -275,14 +275,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 -; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 +; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_SI-NEXT: s_load_dword s0, s[4:5], 0xf ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) @@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_GFX7-NEXT: s_load_dword s0, s[4:5], 0xf ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) @@ -308,13 +308,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c -; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; G_GFX10-NEXT: s_load_dword s0, s[4:5], 0x3c ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) @@ -324,13 +325,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c -; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc +; G_GFX1030-NEXT: s_load_dword s0, s[4:5], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -340,12 +341,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c -; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc +; G_GFX1100-NEXT: s_load_b32 s0, s[4:5], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -361,105 +362,105 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 ; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen ; G_GFX1100-NEXT: s_endpgm main_body: @@ -554,7 +555,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -569,7 +570,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -584,19 +585,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10-NEXT: global_store_dword v1, v0, s[14:15] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -608,7 +609,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -619,7 +620,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -633,7 +634,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -647,19 +648,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s12 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s13 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) -; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11] +; G_GFX10-NEXT: global_store_dword v1, v0, s[14:15] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -671,7 +672,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index a169737493bcf7..c359b843e3cca1 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -11,7 +11,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp32: ; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -65,7 +65,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp32: ; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index 865d64605f65c3..2520e6bf8e1a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -10,7 +10,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp64: ; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -68,7 +68,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp64: ; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 47b195a8f17067..520390cdc4621c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -10,7 +10,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp32_to_fp16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp32_to_fp16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX11-TRUE16-LABEL: test_convert_fp32_to_fp16: ; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16: ; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 973ca516679287..4aec2ffead4372 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -19,24 +19,24 @@ declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -66,14 +66,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -81,14 +81,14 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -102,24 +102,24 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -149,14 +149,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -164,14 +164,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -185,24 +185,24 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -232,14 +232,14 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -247,14 +247,14 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -268,24 +268,24 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -315,14 +315,14 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -330,14 +330,14 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -351,24 +351,24 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -398,14 +398,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -413,14 +413,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -434,24 +434,24 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -481,14 +481,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -496,14 +496,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -517,24 +517,24 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -564,14 +564,14 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -579,14 +579,14 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -600,24 +600,24 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -647,14 +647,14 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -662,14 +662,14 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -683,24 +683,24 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -730,14 +730,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -745,14 +745,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -766,24 +766,24 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -813,14 +813,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -828,14 +828,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -849,24 +849,24 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -896,14 +896,14 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -911,14 +911,14 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -932,24 +932,24 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen ; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -979,14 +979,14 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -994,14 +994,14 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX940-NEXT: s_load_dword s10, s[4:5], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 @@ -1015,7 +1015,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1046,7 +1046,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -1075,7 +1075,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1106,7 +1106,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -1214,7 +1214,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -1254,7 +1254,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1285,7 +1285,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1314,7 +1314,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1328,7 +1328,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1426,12 +1426,12 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1466,10 +1466,10 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1477,10 +1477,10 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do ; ; GFX940-LABEL: local_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX940-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -1516,7 +1516,7 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1527,7 +1527,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 @@ -1542,7 +1542,7 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1553,7 +1553,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 @@ -1568,7 +1568,7 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1579,7 +1579,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll index 90c1759070a59a..829d7432df2cc0 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -16,101 +16,101 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dword s8, s[4:5], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 -; G_SI-NEXT: v_mov_b32_e32 v2, s6 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 +; G_SI-NEXT: v_mov_b32_e32 v2, s8 ; G_SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 -; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm main_body: @@ -253,101 +253,101 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dword s8, s[4:5], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 -; G_SI-NEXT: v_mov_b32_e32 v2, s6 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 +; G_SI-NEXT: v_mov_b32_e32 v2, s8 ; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm main_body: @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,20 +452,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v2, s11 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v2, s15 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,20 +506,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s10 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; G_GFX10-NEXT: v_mov_b32_e32 v2, s11 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s12 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s13 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s14 +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v2, s15 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll index ff6700d10ff53d..0881cd84a4da2c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll @@ -16,101 +16,101 @@ declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dword s8, s[4:5], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 -; G_SI-NEXT: v_mov_b32_e32 v2, s6 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 +; G_SI-NEXT: v_mov_b32_e32 v2, s8 ; G_SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 -; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen +; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm main_body: @@ -253,101 +253,101 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_SI-NEXT: s_load_dword s8, s[4:5], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s4 -; G_SI-NEXT: v_mov_b32_e32 v1, s5 -; G_SI-NEXT: v_mov_b32_e32 v2, s6 +; G_SI-NEXT: v_mov_b32_e32 v0, s6 +; G_SI-NEXT: v_mov_b32_e32 v1, s7 +; G_SI-NEXT: v_mov_b32_e32 v2, s8 ; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; G_GFX7-NEXT: s_load_dword s8, s[4:5], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s7 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; G_GFX1030-NEXT: s_load_dword s8, s[4:5], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s6 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm main_body: @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,20 +452,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v2, s11 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v2, s15 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,20 +506,20 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s10 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; G_GFX10-NEXT: v_mov_b32_e32 v2, s11 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s12 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s13 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s14 +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v2, s15 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 3571f3545ad1a1..162bf521ed6e6b 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,23 +8,23 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; SI-NEXT: v_cvt_i32_f32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,23 +47,23 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32_fabs: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_i32_f32_e64 v0, |s4| +; SI-NEXT: v_cvt_i32_f32_e64 v0, |s6| ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_i32_fabs: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e64 v0, |s4| +; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2| +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -87,7 +87,7 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_sint_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_sint_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: fp_to_sint_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -193,8 +193,8 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x2f800000 @@ -216,14 +216,14 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fp_to_sint_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0x2f800000 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_mov_b32 s5, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s2 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -294,7 +294,7 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 @@ -329,7 +329,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -452,8 +452,8 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_sint_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 @@ -509,24 +509,24 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fp_to_sint_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s8, 0x2f800000 -; VI-NEXT: s_mov_b32 s9, 0xcf800000 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s4, 0x2f800000 +; VI-NEXT: s_mov_b32 s5, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s5 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8 +; VI-NEXT: v_trunc_f32_e32 v0, s9 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; VI-NEXT: v_floor_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v2, v1, s9, |v0| +; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 -; VI-NEXT: v_trunc_f32_e32 v4, s4 +; VI-NEXT: v_trunc_f32_e32 v4, s8 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 -; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8 +; VI-NEXT: v_mul_f32_e64 v3, |v4|, s4 ; VI-NEXT: v_floor_f32_e32 v3, v3 ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 -; VI-NEXT: v_fma_f32 v3, v3, s9, |v4| +; VI-NEXT: v_fma_f32 v3, v3, s5, |v4| ; VI-NEXT: v_xor_b32_e32 v2, v2, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v6, v3 ; VI-NEXT: v_xor_b32_e32 v1, v1, v0 @@ -534,22 +534,22 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1 -; VI-NEXT: v_trunc_f32_e32 v5, s7 +; VI-NEXT: v_trunc_f32_e32 v5, s11 ; VI-NEXT: v_xor_b32_e32 v0, v6, v1 -; VI-NEXT: v_mul_f32_e64 v6, |v5|, s8 +; VI-NEXT: v_mul_f32_e64 v6, |v5|, s4 ; VI-NEXT: v_floor_f32_e32 v6, v6 ; VI-NEXT: v_cvt_u32_f32_e32 v7, v6 -; VI-NEXT: v_fma_f32 v6, v6, s9, |v5| +; VI-NEXT: v_fma_f32 v6, v6, s5, |v5| ; VI-NEXT: v_cvt_u32_f32_e32 v6, v6 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: v_ashrrev_i32_e32 v4, 31, v5 -; VI-NEXT: v_trunc_f32_e32 v8, s6 +; VI-NEXT: v_trunc_f32_e32 v8, s10 ; VI-NEXT: v_xor_b32_e32 v5, v6, v4 -; VI-NEXT: v_mul_f32_e64 v6, |v8|, s8 +; VI-NEXT: v_mul_f32_e64 v6, |v8|, s4 ; VI-NEXT: v_floor_f32_e32 v6, v6 ; VI-NEXT: v_cvt_u32_f32_e32 v9, v6 -; VI-NEXT: v_fma_f32 v6, v6, s9, |v8| +; VI-NEXT: v_fma_f32 v6, v6, s5, |v8| ; VI-NEXT: v_cvt_u32_f32_e32 v10, v6 ; VI-NEXT: v_xor_b32_e32 v7, v7, v4 ; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4 @@ -737,24 +737,24 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4 +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s6 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -787,24 +787,24 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4| +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s6| ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4| +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s6| ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -838,23 +838,23 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_sint_f32_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; SI-NEXT: v_cvt_i32_f32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_f32_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index c6b4e129bacbe2..f4a130148459ac 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -8,23 +8,23 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_uint_f32_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; SI-NEXT: v_cvt_u32_f32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,7 +47,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -60,7 +60,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +92,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -152,8 +152,8 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x) { ; SI-LABEL: fp_to_uint_f32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0xcf800000 @@ -169,17 +169,17 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x ; ; VI-LABEL: fp_to_uint_f32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0xcf800000 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xcf800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 +; VI-NEXT: v_trunc_f32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_floor_f32_e32 v2, v1 -; VI-NEXT: v_fma_f32 v0, v2, s2, v0 +; VI-NEXT: v_fma_f32 v0, v2, s3, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -240,7 +240,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 @@ -264,7 +264,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -376,8 +376,8 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 @@ -412,13 +412,13 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s5 -; VI-NEXT: v_trunc_f32_e32 v4, s4 +; VI-NEXT: v_trunc_f32_e32 v0, s9 +; VI-NEXT: v_trunc_f32_e32 v4, s8 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4 ; VI-NEXT: v_floor_f32_e32 v5, v1 @@ -426,10 +426,10 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; VI-NEXT: v_fma_f32 v0, v5, s2, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; VI-NEXT: v_fma_f32 v0, v6, s2, v4 -; VI-NEXT: v_trunc_f32_e32 v4, s7 +; VI-NEXT: v_trunc_f32_e32 v4, s11 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5 ; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; VI-NEXT: v_trunc_f32_e32 v8, s6 +; VI-NEXT: v_trunc_f32_e32 v8, s10 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6 ; VI-NEXT: v_floor_f32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8 @@ -619,24 +619,24 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4 +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s6 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -669,24 +669,24 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4| +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s6| ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4| +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s6| ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -720,23 +720,23 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; SI-NEXT: v_cvt_u32_f32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index d2e6b9266fa5c7..79b5eca0703029 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -24,45 +24,27 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -80,7 +62,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -107,7 +89,7 @@ entry: define amdgpu_kernel void @fpext_f16_to_f64( ; SI-LABEL: fpext_f16_to_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -124,47 +106,28 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fpext_f16_to_f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fpext_f16_to_f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fpext_f16_to_f64: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fpext_f16_to_f64: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -184,7 +147,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; ; GFX11-FAKE16-LABEL: fpext_f16_to_f64: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -213,7 +176,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-LABEL: fpext_v2f16_to_v2f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -231,47 +194,28 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fpext_v2f16_to_v2f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fpext_v2f16_to_v2f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fpext_v2f16_to_v2f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -292,7 +236,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; ; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -322,7 +266,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; SI-LABEL: fpext_v2f16_to_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -342,51 +286,30 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fpext_v2f16_to_v2f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fpext_v2f16_to_v2f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fpext_v2f16_to_v2f64: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX89-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f64: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -410,7 +333,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; ; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -443,35 +366,35 @@ entry: define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) { ; SI-LABEL: s_fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: s_fneg_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX89-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: s_fneg_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -480,12 +403,12 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-FAKE16-NEXT: s_endpgm entry: @@ -499,7 +422,7 @@ entry: define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; SI-LABEL: fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -515,45 +438,27 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fneg_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fneg_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fneg_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fneg_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -571,7 +476,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -599,7 +504,7 @@ entry: define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; SI-LABEL: fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -615,45 +520,27 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fabs_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fabs_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fabs_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fabs_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -671,7 +558,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -699,7 +586,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; SI-LABEL: fneg_fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -715,45 +602,27 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fneg_fabs_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fneg_fabs_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -771,7 +640,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -802,7 +671,7 @@ entry: define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -822,53 +691,31 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: fneg_multi_use_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fneg_multi_use_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -892,7 +739,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -925,7 +772,7 @@ entry: define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -947,53 +794,31 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; VI-NEXT: v_mul_f16_e64 v0, -v0, v0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX9-NEXT: v_mul_f16_e64 v0, -v0, v0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -1017,7 +842,7 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -1051,7 +876,7 @@ entry: define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1071,53 +896,31 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: fabs_multi_use_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fabs_multi_use_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -1141,7 +944,7 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -1174,7 +977,7 @@ entry: define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1196,53 +999,31 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; VI-NEXT: v_mul_f16_e64 v0, |v0|, v0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX9-NEXT: v_mul_f16_e64 v0, |v0|, v0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -1266,7 +1047,7 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -1300,7 +1081,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1320,53 +1101,31 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -1390,7 +1149,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -1424,7 +1183,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1446,53 +1205,31 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX9-NEXT: v_mul_f16_e64 v0, -|v0|, v0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -1516,7 +1253,7 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -1551,3 +1288,6 @@ entry: declare half @llvm.fabs.f16(half) #1 attributes #1 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9: {{.*}} +; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 9e92a89501cf6b..407626a8e92d57 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; SI-LABEL: fptosi_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; VI-LABEL: fptosi_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -45,7 +45,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; GFX11-TRUE16-LABEL: fptosi_f16_to_i16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -63,7 +63,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; GFX11-FAKE16-LABEL: fptosi_f16_to_i16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -90,7 +90,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i32( ; SI-LABEL: fptosi_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; VI-LABEL: fptosi_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -128,7 +128,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; GFX11-TRUE16-LABEL: fptosi_f16_to_i32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -148,7 +148,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; GFX11-FAKE16-LABEL: fptosi_f16_to_i32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i64( ; SI-LABEL: fptosi_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -200,7 +200,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; VI-LABEL: fptosi_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -220,7 +220,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; GFX11-TRUE16-LABEL: fptosi_f16_to_i64: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; GFX11-FAKE16-LABEL: fptosi_f16_to_i64: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -271,7 +271,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; SI-LABEL: fptosi_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -296,7 +296,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; VI-LABEL: fptosi_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -316,7 +316,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -343,7 +343,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -376,7 +376,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; SI-LABEL: fptosi_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -398,7 +398,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; VI-LABEL: fptosi_v2f16_to_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -419,7 +419,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -443,7 +443,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -479,7 +479,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; SI-LABEL: fptosi_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -503,7 +503,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; VI-LABEL: fptosi_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -526,7 +526,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i64: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -553,7 +553,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -589,12 +589,12 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptosi_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, -1.0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -602,12 +602,12 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fptosi_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s4 +; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s6 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -615,12 +615,12 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-TRUE16-LABEL: fptosi_f16_to_i1: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -630,11 +630,11 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-FAKE16-LABEL: fptosi_f16_to_i1: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index 804208998f9e8f..ff00633cad4920 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; SI-LABEL: fptoui_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; VI-LABEL: fptoui_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -45,7 +45,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; GFX11-TRUE16-LABEL: fptoui_f16_to_i16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -63,7 +63,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; GFX11-FAKE16-LABEL: fptoui_f16_to_i16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -90,7 +90,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i32( ; SI-LABEL: fptoui_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; VI-LABEL: fptoui_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -128,7 +128,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; GFX11-TRUE16-LABEL: fptoui_f16_to_i32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -148,7 +148,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; GFX11-FAKE16-LABEL: fptoui_f16_to_i32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i64( ; SI-LABEL: fptoui_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -200,7 +200,7 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; ; VI-LABEL: fptoui_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -220,7 +220,7 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; ; GFX11-TRUE16-LABEL: fptoui_f16_to_i64: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; ; GFX11-FAKE16-LABEL: fptoui_f16_to_i64: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -271,7 +271,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; SI-LABEL: fptoui_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -295,7 +295,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; VI-LABEL: fptoui_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -315,7 +315,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -342,7 +342,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -375,7 +375,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; SI-LABEL: fptoui_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -397,7 +397,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; VI-LABEL: fptoui_v2f16_to_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -418,7 +418,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -442,7 +442,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -478,7 +478,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; SI-LABEL: fptoui_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -502,7 +502,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; VI-LABEL: fptoui_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -525,7 +525,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i64: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -551,7 +551,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -586,12 +586,12 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptoui_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 1.0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -600,12 +600,12 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fptoui_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 1.0, s4 +; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 1.0, s6 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -613,12 +613,12 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-TRUE16-LABEL: fptoui_f16_to_i1: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -628,11 +628,11 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-FAKE16-LABEL: fptoui_f16_to_i1: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 0ea412a6b6f131..0005f1179a5d28 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -41,7 +41,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -59,7 +59,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -71,37 +71,37 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -140,7 +140,7 @@ entry: define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-SDAG-LABEL: fptrunc_f64_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -159,7 +159,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f64_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -172,7 +172,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -191,7 +191,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -204,39 +204,39 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -256,7 +256,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -279,7 +279,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -300,7 +300,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -315,7 +315,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -335,7 +335,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -350,41 +350,41 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -405,7 +405,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -429,7 +429,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -452,7 +452,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -469,7 +469,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -491,7 +491,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -507,45 +507,45 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] -; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -570,7 +570,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -597,7 +597,7 @@ entry: define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -615,7 +615,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -627,7 +627,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -645,7 +645,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -657,37 +657,37 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0 -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -705,7 +705,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -727,7 +727,7 @@ entry: define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -745,7 +745,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -757,7 +757,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -775,7 +775,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -787,37 +787,37 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0| -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -835,7 +835,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -857,7 +857,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -875,7 +875,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -887,7 +887,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -905,7 +905,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -917,37 +917,37 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0| -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0| -; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2| +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -965,7 +965,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -988,7 +988,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1006,7 +1006,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1036,7 +1036,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1048,37 +1048,37 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1123,7 +1123,7 @@ entry: define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1141,7 +1141,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1153,7 +1153,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1171,7 +1171,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1183,37 +1183,37 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0| -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1259,7 +1259,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1278,7 +1278,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1291,7 +1291,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1323,39 +1323,39 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s7 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 ; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s4 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX9-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1375,7 +1375,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 7a18e2ef7b4a84..3d3e8bea7e33ef 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-SDAG-LABEL: fptrunc_f64_to_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-GISEL-LABEL: fptrunc_f64_to_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -50,7 +50,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -60,7 +60,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 @@ -70,7 +70,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -80,7 +80,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -95,7 +95,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -155,7 +155,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-SDAG: ; %bb.0: -; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-GISEL: ; %bb.0: -; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8 @@ -266,7 +266,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-SDAG: ; %bb.0: -; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 @@ -277,7 +277,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-GISEL: ; %bb.0: -; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -288,7 +288,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-SDAG: ; %bb.0: -; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -344,7 +344,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-GISEL: ; %bb.0: -; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -396,7 +396,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-SDAG: ; %bb.0: -; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -407,7 +407,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-GISEL: ; %bb.0: -; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -418,7 +418,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-SDAG: ; %bb.0: -; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -483,7 +483,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-GISEL: ; %bb.0: -; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -540,7 +540,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-SDAG: ; %bb.0: -; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -552,7 +552,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-GISEL: ; %bb.0: -; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -570,89 +570,89 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fptrunc_v2f64_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-GISEL-NEXT: s_mov_b32 s2, -1 -; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s6, -1 +; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc <2 x double> %in to <2 x float> @@ -663,56 +663,56 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) { ; SI-LABEL: fptrunc_v3f64_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x15 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[4:5] ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x54 -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s6, -1 +; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] ; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -722,27 +722,27 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] ; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -752,14 +752,14 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] ; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc <3 x double> %in to <3 x float> @@ -770,103 +770,103 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fptrunc_v4f64_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] ; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] ; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] ; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] ; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] ; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] ; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc <4 x double> %in to <4 x float> @@ -877,57 +877,57 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) { ; SI-LABEL: fptrunc_v8f64_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; SI-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] -; SI-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] -; SI-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] -; SI-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] +; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; SI-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; SI-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; SI-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] ; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] ; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-GISEL-NEXT: s_endpgm @@ -935,19 +935,19 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] ; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-SDAG-NEXT: s_endpgm @@ -955,19 +955,19 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] -; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] ; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX10-GISEL-NEXT: s_endpgm @@ -975,19 +975,19 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x64 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 ; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 @@ -996,19 +996,19 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x64 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index a92015269f8ce9..5febd5256e7949 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -10,23 +10,23 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 @@ -51,21 +51,21 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -92,19 +92,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_ushort v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -126,12 +126,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -148,19 +148,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -177,19 +177,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -213,19 +213,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 ; GFX1150-NEXT: s_waitcnt vmcnt(1) ; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1150-NEXT: s_waitcnt vmcnt(0) @@ -250,7 +250,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 @@ -264,23 +264,23 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -293,21 +293,21 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 ; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 +; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_rcp_f32_e32 v2, v1 @@ -322,19 +322,19 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_ushort v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f16_e32 v3, v2 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3 @@ -345,48 +345,48 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fast_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fast_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e32 v3, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -394,19 +394,19 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: fast_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -415,7 +415,7 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 @@ -429,23 +429,23 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v2, v1 @@ -458,21 +458,21 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 ; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 +; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_rcp_f32_e32 v2, v1 @@ -487,19 +487,19 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_ushort v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f16_e32 v3, v2 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3 @@ -510,48 +510,48 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: unsafe_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: unsafe_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e32 v3, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -559,19 +559,19 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: unsafe_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -580,7 +580,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 @@ -594,21 +594,21 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 ; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 @@ -630,21 +630,21 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 @@ -666,19 +666,19 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 ; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 @@ -700,14 +700,14 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 +; GFX9-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, v1 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 ; GFX9-NEXT: v_rcp_f32_e32 v5, v4 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -722,21 +722,21 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v4, s2, v2, v2, v1 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 ; GFX10-NEXT: v_rcp_f32_e32 v5, v4 ; GFX10-NEXT: s_denorm_mode 15 @@ -751,19 +751,19 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 @@ -786,19 +786,19 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 ; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 @@ -822,7 +822,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 @@ -836,21 +836,21 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 @@ -861,21 +861,21 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_rcp_f32_e32 v2, v1 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 @@ -886,19 +886,19 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3 @@ -909,48 +909,48 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fast_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fast_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -958,19 +958,19 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: fast_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -979,7 +979,7 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 @@ -993,21 +993,21 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 @@ -1018,21 +1018,21 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_rcp_f32_e32 v2, v1 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 @@ -1043,19 +1043,19 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3 @@ -1066,48 +1066,48 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: unsafe_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: unsafe_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1115,19 +1115,19 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: unsafe_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 @@ -1150,21 +1150,21 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1208,21 +1208,21 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1243,17 +1243,17 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] @@ -1274,14 +1274,14 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1295,21 +1295,21 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1322,19 +1322,19 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1355,19 +1355,19 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX11-NEXT: global_store_b64 v12, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v12, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[6:7] -; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[0:1] +; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[4:5] ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) @@ -1388,7 +1388,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[4:5] +; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 @@ -1401,21 +1401,21 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1452,21 +1452,21 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1483,17 +1483,17 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -1510,12 +1510,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1527,19 +1527,19 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fast_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1551,19 +1551,19 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fast_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1580,19 +1580,19 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: fast_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7] -; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[4:5] ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1609,7 +1609,7 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 @@ -1622,21 +1622,21 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1673,21 +1673,21 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s8, s0 +; CI-NEXT: s_mov_b32 s9, s1 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1704,17 +1704,17 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -1731,12 +1731,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1748,19 +1748,19 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: unsafe_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1772,19 +1772,19 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: unsafe_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1801,19 +1801,19 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: unsafe_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7] -; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[4:5] ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1830,7 +1830,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %r0 = load double, ptr addrspace(1) %in1, align 8 @@ -1843,25 +1843,25 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -1906,21 +1906,21 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 ; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; CI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1969,19 +1969,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: flat_load_dword v4, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 @@ -2021,12 +2021,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2060,19 +2060,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_trunc_f16_e32 v1, v1 ; GFX9-NEXT: v_fma_f16 v1, -v1, v6, v4 ; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2106,19 +2106,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_trunc_f16_e32 v4, v4 ; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 ; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2166,19 +2166,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f16 v1, -v1, v6, v4 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v2f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 ; GFX1150-NEXT: s_waitcnt vmcnt(1) ; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX1150-NEXT: s_waitcnt vmcnt(0) @@ -2229,7 +2229,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3 -; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 @@ -2243,20 +2243,20 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2264,7 +2264,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2346,20 +2346,20 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 ; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -2367,7 +2367,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -2449,19 +2449,19 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 @@ -2533,12 +2533,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2603,19 +2603,19 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_trunc_f16_e32 v0, v0 ; GFX9-NEXT: v_fma_f16 v0, -v0, v7, v5 ; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2680,19 +2680,19 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 ; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 ; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2785,19 +2785,19 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_fma_f16 v0, -v0, v7, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0 -; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v4f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7] -; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 ; GFX1150-NEXT: s_waitcnt vmcnt(1) ; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX1150-NEXT: s_waitcnt vmcnt(0) @@ -2895,7 +2895,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3 ; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 @@ -2909,21 +2909,21 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 @@ -2960,21 +2960,21 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 ; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 @@ -3011,19 +3011,19 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 32 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 @@ -3060,14 +3060,14 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 +; GFX9-NEXT: v_div_scale_f32 v6, s[2:3], v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -3082,7 +3082,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 +; GFX9-NEXT: v_div_scale_f32 v5, s[2:3], v2, v2, v0 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; GFX9-NEXT: v_rcp_f32_e32 v6, v5 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -3097,21 +3097,21 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 +; GFX10-NEXT: v_div_scale_f32 v6, s2, v3, v3, v1 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 @@ -3126,7 +3126,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 ; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 +; GFX10-NEXT: v_div_scale_f32 v5, s2, v2, v2, v0 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-NEXT: s_denorm_mode 15 @@ -3141,19 +3141,19 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 ; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 @@ -3197,19 +3197,19 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0 -; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v2f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7] -; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 ; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 @@ -3255,7 +3255,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2 -; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 @@ -3269,21 +3269,21 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 @@ -3350,21 +3350,21 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 ; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 @@ -3431,19 +3431,19 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s0, 64 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: s_add_u32 s0, s4, 64 +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 @@ -3510,14 +3510,14 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 +; GFX9-NEXT: v_div_scale_f32 v10, s[2:3], v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 ; GFX9-NEXT: v_rcp_f32_e32 v11, v10 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -3532,7 +3532,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 ; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 +; GFX9-NEXT: v_div_scale_f32 v9, s[2:3], v6, v6, v2 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; GFX9-NEXT: v_rcp_f32_e32 v10, v9 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -3547,7 +3547,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 +; GFX9-NEXT: v_div_scale_f32 v7, s[2:3], v5, v5, v1 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; GFX9-NEXT: v_rcp_f32_e32 v9, v7 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -3562,7 +3562,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 +; GFX9-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, v0 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -3577,21 +3577,21 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 +; GFX10-NEXT: v_div_scale_f32 v10, s2, v7, v7, v3 ; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 ; GFX10-NEXT: v_rcp_f32_e32 v11, v10 ; GFX10-NEXT: s_denorm_mode 15 @@ -3606,7 +3606,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9 ; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 +; GFX10-NEXT: v_div_scale_f32 v9, s2, v6, v6, v2 ; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9 ; GFX10-NEXT: s_denorm_mode 15 @@ -3621,7 +3621,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7 ; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 +; GFX10-NEXT: v_div_scale_f32 v7, s2, v5, v5, v1 ; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7 ; GFX10-NEXT: s_denorm_mode 15 @@ -3636,7 +3636,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6 ; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 +; GFX10-NEXT: v_div_scale_f32 v6, s2, v4, v4, v0 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 @@ -3651,19 +3651,19 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 ; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 ; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 @@ -3749,19 +3749,19 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v5, v5 ; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v4f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v8, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[6:7] -; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64 +; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 ; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 @@ -3851,7 +3851,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4 -; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 @@ -3865,21 +3865,21 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -3957,21 +3957,21 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 ; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s0, s8 +; CI-NEXT: s_mov_b32 s1, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s2 ; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] ; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -4006,19 +4006,19 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s0, 64 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: s_add_u32 s0, s4, 64 +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] @@ -4053,14 +4053,14 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[8:9], s[2:3], v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -4074,7 +4074,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[6:7], s[2:3], v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] @@ -4088,21 +4088,21 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: frem_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[8:9], s2, v[6:7], v[6:7], v[2:3] ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -4115,7 +4115,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[6:7], s2, v[4:5], v[4:5], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] @@ -4128,19 +4128,19 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: frem_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64 +; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4180,19 +4180,19 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX11-NEXT: global_store_b128 v16, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX1150-LABEL: frem_v2f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v16, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[6:7] -; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64 +; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[2:3] +; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64 ; GFX1150-NEXT: s_waitcnt vmcnt(0) ; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) @@ -4231,7 +4231,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[4:5] +; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[0:1] ; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 8ad20d3da9a96c..43caa4c739fb3e 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -13,49 +13,49 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind rea define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s4, 1 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_not_b32 s4, s6 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_lshr_b32 s1, s0, 1 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; SI-NEXT: s_not_b32 s0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_not_b32 s3, s6 -; VI-NEXT: s_lshr_b32 s2, s4, 1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_lshr_b32 s1, s0, 1 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_not_b32 s3, s6 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -75,30 +75,30 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshl_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: s_not_b32 s3, s6 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_not_b32 s1, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) @@ -109,7 +109,7 @@ entry: define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +122,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 @@ -133,12 +133,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 25 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -155,16 +155,16 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -179,70 +179,70 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_not_b32 s1, s1 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s2, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_not_b32 s0, s0 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s1, s4, 1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v2 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 +; SI-NEXT: s_not_b32 s3, s5 +; SI-NEXT: s_lshr_b32 s1, s1, 1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_not_b32 s1, s4 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; SI-NEXT: s_lshr_b32 s0, s0, 1 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: s_lshr_b32 s7, s5, 1 -; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s1, s4, 1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_not_b32 s7, s7 +; VI-NEXT: s_lshr_b32 s3, s1, 1 +; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_not_b32 s1, s6 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; VI-NEXT: s_lshr_b32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: s_not_b32 s3, s9 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s3, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s3, s1, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 +; GFX9-NEXT: s_not_b32 s1, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_not_b32 s1, s8 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -266,39 +266,39 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s5, s7, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX10-NEXT: s_lshr_b32 s2, s5, 1 -; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v1, s2, v0, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s0 +; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 +; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_not_b32 s2, s7 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s1, s1 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 +; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_not_b32 s2, s7 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_not_b32 s3, s6 +; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 +; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -309,43 +309,43 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s4, v2, 25 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 +; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 +; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -365,25 +365,25 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 25 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) @@ -394,106 +394,106 @@ entry: define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s1, s19 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v3, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_not_b32 s1, s18 -; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v2, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_not_b32 s1, s17 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_not_b32 s1, s16 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s4, 1 -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_not_b32 s5, s19 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s11, 1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: s_not_b32 s5, s18 +; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s10, 1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: s_not_b32 s5, s17 +; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s9, 1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: s_not_b32 s5, s16 +; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s8, 1 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: s_not_b32 s3, s15 -; VI-NEXT: s_lshr_b32 s2, s7, 1 -; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: s_not_b32 s3, s14 -; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s6, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: s_not_b32 s3, s13 -; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s5, 1 +; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: s_not_b32 s3, s3 +; VI-NEXT: s_lshr_b32 s6, s11, 1 +; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s14 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; VI-NEXT: s_lshr_b32 s3, s10, 1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: s_not_b32 s1, s1 +; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1 +; VI-NEXT: s_lshr_b32 s2, s9, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_not_b32 s3, s12 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s4, 1 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_alignbit_b32 v0, s2, v0, v4 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: s_lshr_b32 s1, s8, 1 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_not_b32 s3, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_lshr_b32 s2, s7, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s3, s14 -; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s3, s13 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 +; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: s_lshr_b32 s4, s11, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_lshr_b32 s3, s10, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1 +; GFX9-NEXT: s_lshr_b32 s2, s9, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s3, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_lshr_b32 s1, s8, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -525,55 +525,55 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshl_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1 -; GFX10-NEXT: s_lshr_b32 s2, s7, 1 -; GFX10-NEXT: s_not_b32 s3, s15 -; GFX10-NEXT: s_lshr_b32 s6, s6, 1 -; GFX10-NEXT: s_not_b32 s7, s14 -; GFX10-NEXT: s_lshr_b32 s5, s5, 1 -; GFX10-NEXT: s_not_b32 s9, s13 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 -; GFX10-NEXT: s_not_b32 s8, s12 -; GFX10-NEXT: v_alignbit_b32 v3, s2, v0, s3 -; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7 -; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9 -; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1 +; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1 +; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1 +; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1 +; GFX10-NEXT: s_lshr_b32 s4, s11, 1 +; GFX10-NEXT: s_not_b32 s3, s3 +; GFX10-NEXT: s_lshr_b32 s5, s10, 1 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_lshr_b32 s9, s9, 1 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshr_b32 s8, s8, 1 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3 +; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2 +; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 -; GFX11-NEXT: v_alignbit_b32 v1, s6, s10, 1 -; GFX11-NEXT: v_alignbit_b32 v5, s5, s9, 1 -; GFX11-NEXT: v_alignbit_b32 v6, s4, s8, 1 -; GFX11-NEXT: s_lshr_b32 s2, s7, 1 -; GFX11-NEXT: s_not_b32 s3, s15 -; GFX11-NEXT: s_lshr_b32 s6, s6, 1 -; GFX11-NEXT: s_not_b32 s7, s14 -; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s9, s13 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s8, s12 -; GFX11-NEXT: v_alignbit_b32 v3, s2, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v2, s6, v1, s7 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v5, s9 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v6, s8 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1 +; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1 +; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1 +; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1 +; GFX11-NEXT: s_lshr_b32 s6, s11, 1 +; GFX11-NEXT: s_not_b32 s3, s3 +; GFX11-NEXT: s_lshr_b32 s7, s10, 1 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_lshr_b32 s9, s9, 1 +; GFX11-NEXT: s_not_b32 s1, s1 +; GFX11-NEXT: s_lshr_b32 s8, s8, 1 +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3 +; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2 +; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1 +; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) @@ -584,54 +584,54 @@ entry: define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshl_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23 +; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; VI-NEXT: v_alignbit_b32 v1, s5, v4, 25 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: v_mov_b32_e32 v4, s13 +; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31 +; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23 +; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31 +; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23 +; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -656,28 +656,28 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshl_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 +; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31 +; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23 +; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25 +; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 23 -; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 25 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 31 +; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31 +; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23 +; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -690,7 +690,7 @@ entry: define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -706,7 +706,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; VI-LABEL: orxor2or1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 @@ -720,15 +720,15 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s6, 7 -; GFX9-NEXT: s_or_b32 s0, s7, s0 -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_lshl_b32 s4, s2, 7 +; GFX9-NEXT: s_or_b32 s4, s3, s4 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: orxor2or1: @@ -747,20 +747,20 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s0, s6, 7 -; GFX10-NEXT: s_or_b32 s0, s7, s0 -; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: s_cselect_b32 s0, s6, s7 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_lshl_b32 s4, s2, 7 +; GFX10-NEXT: s_or_b32 s4, s3, s4 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: orxor2or1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 551af1aa5cf7ae..5ca81ce9f9e073 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -22,40 +22,40 @@ declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -72,25 +72,25 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) @@ -101,7 +101,7 @@ entry: define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 @@ -125,12 +125,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -147,16 +147,16 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -171,51 +171,51 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -236,31 +236,31 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3] +; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -271,43 +271,43 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s4, v2, 7 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 +; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 +; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -327,25 +327,25 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) @@ -356,70 +356,70 @@ entry: define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s19, 0xf000 -; SI-NEXT: s_mov_b32 s18, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s14 -; VI-NEXT: v_alignbit_b32 v2, s6, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32: @@ -443,39 +443,39 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s15 -; GFX10-NEXT: v_mov_b32_e32 v1, s14 -; GFX10-NEXT: v_mov_b32_e32 v4, s13 -; GFX10-NEXT: v_mov_b32_e32 v5, s12 -; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, v0 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0 +; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1 +; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4 +; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, v0 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, v1 +; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, v0 +; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, v4 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, v5 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) @@ -486,54 +486,54 @@ entry: define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshr_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9 +; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s5, v4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: v_mov_b32_e32 v4, s13 +; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 +; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 +; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 +; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -556,28 +556,28 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshr_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 +; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1 +; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9 +; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 7 +; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9 -; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1 +; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1 +; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9 +; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index c6c145e090829c..8e04a240d0a1c3 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -3975,8 +3975,8 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) { define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_neg0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s0, s[4:5], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4005,17 +4005,18 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_neg0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[4:5], 0xb ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4031,26 +4032,25 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_neg0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s0, s[4:5], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 ; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 -; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4069,18 +4069,17 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; ; GISEL-DAZ-LABEL: elim_redun_check_neg0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[4:5], 0xb +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4095,8 +4094,9 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4110,8 +4110,8 @@ entry: define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_pos0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s0, s[4:5], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4140,17 +4140,18 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_pos0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[4:5], 0xb ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4165,26 +4166,25 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s6, 0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_pos0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s0, s[4:5], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 ; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 -; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4203,18 +4203,17 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; ; GISEL-DAZ-LABEL: elim_redun_check_pos0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[4:5], 0xb +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4228,8 +4227,9 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s4, 0 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4243,8 +4243,8 @@ entry: define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s0, s[4:5], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4273,17 +4273,18 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; ; GISEL-IEEE-LABEL: elim_redun_check_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[4:5], 0xb ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4299,26 +4300,25 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s0, s[4:5], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 ; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 -; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; SDAG-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4337,18 +4337,17 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; ; GISEL-DAZ-LABEL: elim_redun_check_ult: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[4:5], 0xb +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4363,8 +4362,9 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4378,7 +4378,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4426,7 +4426,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; GISEL-IEEE-LABEL: elim_redun_check_v2: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4478,7 +4478,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; SDAG-DAZ-LABEL: elim_redun_check_v2: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4524,7 +4524,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; GISEL-DAZ-LABEL: elim_redun_check_v2: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) @@ -4582,7 +4582,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4630,7 +4630,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4682,7 +4682,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; SDAG-DAZ-LABEL: elim_redun_check_v2_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4728,7 +4728,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-DAZ-LABEL: elim_redun_check_v2_ult: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index 5495f0a8b0b714..123b43cf761431 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -7,73 +7,73 @@ define amdgpu_kernel void @fsub_f16( ; SI-LABEL: fsub_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fsub_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s14, s10 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s14, s6 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s12, s6 -; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s11 -; GFX89-NEXT: s_mov_b32 s2, s10 -; GFX89-NEXT: s_mov_b32 s3, s11 +; GFX89-NEXT: s_mov_b32 s12, s2 +; GFX89-NEXT: s_mov_b32 s13, s3 +; GFX89-NEXT: s_mov_b32 s15, s7 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s4 -; GFX89-NEXT: s_mov_b32 s9, s5 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 ; GFX89-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm @@ -91,7 +91,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_a( ; SI-LABEL: fsub_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -109,45 +109,27 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fsub_f16_imm_a: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_sub_f16_e32 v0, 1.0, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fsub_f16_imm_a: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_sub_f16_e32 v0, 1.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fsub_f16_imm_a: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -174,7 +156,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_b( ; SI-LABEL: fsub_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -192,45 +174,27 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: fsub_f16_imm_b: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_add_f16_e32 v0, -2.0, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fsub_f16_imm_b: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_add_f16_e32 v0, -2.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: fsub_f16_imm_b: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_add_f16_e32 v0, -2.0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -257,21 +221,21 @@ entry: define amdgpu_kernel void @fsub_v2f16( ; SI-LABEL: fsub_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -286,73 +250,73 @@ define amdgpu_kernel void @fsub_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fsub_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fsub_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -371,7 +335,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_a( ; SI-LABEL: fsub_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -397,7 +361,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; VI-LABEL: fsub_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -418,26 +382,26 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX9-LABEL: fsub_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x40003c00 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 neg_lo:[1,0] neg_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -464,7 +428,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_b( ; SI-LABEL: fsub_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -490,7 +454,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; VI-LABEL: fsub_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,26 +475,26 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX9-LABEL: fsub_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xbc00c000 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xbc00c000 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 8846068e750d46..6fd2c5a1267fb8 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -6,7 +6,7 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i1_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 1 +; GFX9-NEXT: s_and_b32 s4, s16, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -28,7 +28,7 @@ define void @void_func_i8_inreg(i8 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i8_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -47,7 +47,7 @@ define void @void_func_i16_inreg(i16 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -66,7 +66,7 @@ define void @void_func_i32_inreg(i32 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -85,8 +85,8 @@ define void @void_func_i64_inreg(i64 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -105,7 +105,7 @@ define void @void_func_f16_inreg(half inreg %arg0) #0 { ; GFX9-LABEL: void_func_f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -124,7 +124,7 @@ define void @void_func_f32_inreg(float inreg %arg0) #0 { ; GFX9-LABEL: void_func_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -143,8 +143,8 @@ define void @void_func_f64_inreg(double inreg %arg0) #0 { ; GFX9-LABEL: void_func_f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -163,7 +163,7 @@ define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -182,9 +182,9 @@ define void @void_func_v3i16_inreg(<3 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -214,8 +214,8 @@ define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -234,10 +234,10 @@ define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -259,10 +259,10 @@ define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -282,8 +282,8 @@ define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -302,9 +302,9 @@ define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -324,10 +324,10 @@ define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -347,12 +347,12 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -360,7 +360,7 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_clause 0x1 @@ -375,16 +375,16 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -392,8 +392,8 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -437,28 +437,28 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -466,12 +466,12 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -488,33 +488,31 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v32i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[14:17], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[16:19], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -522,25 +520,25 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v32i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off ; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off ; GFX11-NEXT: s_setpc_b64 s[30:31] store <32 x i32> %arg0, ptr addrspace(1) undef @@ -551,10 +549,10 @@ define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -574,13 +572,13 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -588,7 +586,7 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v3i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -603,16 +601,16 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -620,8 +618,8 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -636,20 +634,20 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -657,11 +655,11 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off @@ -675,28 +673,28 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -704,12 +702,12 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -726,33 +724,31 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[14:17], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[16:19], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -760,25 +756,25 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off ; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off ; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i64> %arg0, ptr addrspace(1) undef @@ -789,7 +785,7 @@ define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -808,9 +804,9 @@ define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -831,8 +827,8 @@ define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -851,10 +847,10 @@ define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -874,16 +870,16 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -891,8 +887,8 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -907,8 +903,8 @@ define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -927,9 +923,9 @@ define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -949,10 +945,10 @@ define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -972,16 +968,16 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -989,8 +985,8 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1005,28 +1001,28 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1034,12 +1030,12 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1056,10 +1052,10 @@ define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1079,13 +1075,13 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1093,7 +1089,7 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v3f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1108,16 +1104,16 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1125,8 +1121,8 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1141,28 +1137,28 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1170,12 +1166,12 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1192,33 +1188,31 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[14:17], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v17, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[16:19], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1226,25 +1220,25 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off ; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off ; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x double> %arg0, ptr addrspace(1) undef @@ -1255,86 +1249,86 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr ; GFX9-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: v_mov_b32_e32 v25, v1 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[14:17], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[22:25], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_byte v[0:1], v17, off +; GFX9-NEXT: global_store_byte v[0:1], v19, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v18, off +; GFX9-NEXT: global_store_short v[0:1], v20, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v19, off +; GFX9-NEXT: global_store_short v[0:1], v21, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 +; GFX11-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_mov_b32 v20, v0 ; GFX11-NEXT: v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 -; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 -; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 +; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-NEXT: v_and_b32_e32 v12, 1, v14 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b8 v[0:1], v12, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b8 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b8 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v14, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v17, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef @@ -1349,77 +1343,77 @@ define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32 ; GFX9-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: v_mov_b32_e32 v25, v1 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[14:17], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[22:25], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[16:17], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[18:19], off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[20:21], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: v_dual_mov_b32 v21, v1 :: v_dual_mov_b32 v20, v0 +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29 +; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 +; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 +; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 +; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 -; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 -; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 -; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v[0:1], v[12:13], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[22:25], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v[0:1], v[14:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v[0:1], v[16:17], off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i32> %arg1, ptr addrspace(1) undef @@ -1431,53 +1425,47 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX9-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s6 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s7 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s16 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s17 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s18 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: global_store_dword v[0:1], v16, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s21 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s22 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s22 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s23 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s23 -; GFX9-NEXT: global_store_dword v[0:1], v16, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s24 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s24 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s25 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s25 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s26 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s26 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s27 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s28 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s28 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s29 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: global_store_dword v[0:1], v18, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1511,63 +1499,62 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v15, off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v17, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 -; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 -; GFX11-NEXT: v_mov_b32_e32 v16, s6 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s1 +; GFX11-NEXT: v_mov_b32_e32 v16, s2 +; GFX11-NEXT: v_mov_b32_e32 v18, s19 ; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: v_dual_mov_b32 v12, s7 -; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v14, s17 -; GFX11-NEXT: v_mov_b32_e32 v16, s19 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v17, s18 +; GFX11-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 ; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v12, s20 -; GFX11-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v14, s22 -; GFX11-NEXT: v_mov_b32_e32 v16, s24 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v17, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v18, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v15, s21 :: v_dual_mov_b32 v14, s20 +; GFX11-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23 +; GFX11-NEXT: v_mov_b32_e32 v18, s24 ; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v12, s25 -; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 -; GFX11-NEXT: v_mov_b32_e32 v16, s29 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v17, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v18, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v14, s25 +; GFX11-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v16, s27 +; GFX11-NEXT: v_mov_b32_e32 v18, s29 ; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v17, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v18, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -1592,6 +1579,10 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v11, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, @@ -1643,10 +1634,10 @@ define void @void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg ; GFX9-LABEL: void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1669,24 +1660,24 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-LABEL: caller_void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s19, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[20:21], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s19, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s2, s18 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1762,7 +1753,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 { ; GFX9-LABEL: void_func_bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1781,7 +1772,7 @@ define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1800,9 +1791,9 @@ define void @void_func_v3bf16_inreg(<3 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1823,8 +1814,8 @@ define void @void_func_v4bf16_inreg(<4 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1843,10 +1834,10 @@ define void @void_func_v8bf16_inreg(<8 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1866,16 +1857,16 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1883,8 +1874,8 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1899,10 +1890,10 @@ define void @void_func_2_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1925,10 +1916,10 @@ define void @void_func_2_i64_inreg(i64 inreg %arg0, i64 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -1954,13 +1945,13 @@ define void @void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX9-LABEL: void_func_i64_inreg_i32_inreg_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v3, s18 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-NEXT: v_mov_b32_e32 v3, s20 ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -1971,7 +1962,7 @@ define void @void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s1 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s16 ; GFX11-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1990,19 +1981,19 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX9-LABEL: void_func_5_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2012,7 +2003,7 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc @@ -2036,12 +2027,12 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX9-LABEL: void_func_a5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 ; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:16 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2049,7 +2040,7 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX11-LABEL: void_func_a5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_clause 0x1 @@ -2067,34 +2058,34 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX9-LABEL: void_func_a13i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s27, s33 +; GFX9-NEXT: s_mov_b32 s29, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GFX9-NEXT: s_or_saveexec_b64 vcc, -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[28:29] -; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: s_mov_b64 exec, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s28 ; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48 -; GFX9-NEXT: v_mov_b32_e32 v5, s25 -; GFX9-NEXT: v_mov_b32_e32 v4, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v5, s27 +; GFX9-NEXT: v_mov_b32_e32 v4, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s24 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 -; GFX9-NEXT: v_writelane_b32 v40, s27, 2 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_writelane_b32 v40, s29, 2 +; GFX9-NEXT: v_mov_b32_e32 v5, s23 +; GFX9-NEXT: v_mov_b32_e32 v4, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s20 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: s_getpc_b64 s[16:17] ; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2113,24 +2104,24 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX11-LABEL: void_func_a13i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s23, s33 +; GFX11-NEXT: s_mov_b32 s25, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s24, -1 +; GFX11-NEXT: s_or_saveexec_b32 s26, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s24 +; GFX11-NEXT: s_mov_b32 exec_lo, s26 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19 -; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17 -; GFX11-NEXT: s_getpc_b64 s[18:19] -; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7 -; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s23, 2 -; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21 -; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v3, s21 +; GFX11-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v9, s19 +; GFX11-NEXT: s_getpc_b64 s[20:21] +; GFX11-NEXT: s_add_u32 s20, s20, extern@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s21, s21, extern@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v13, s3 +; GFX11-NEXT: s_load_b64 s[16:17], s[20:21], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s25, 2 +; GFX11-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v5, s23 ; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v10, s0 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48 @@ -2181,41 +2172,41 @@ define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addr ; GFX9-LABEL: void_func_a16i32_inreg__noimplicit: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, s29 ; GFX9-NEXT: v_mov_b32_e32 v4, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:48 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s27 +; GFX9-NEXT: v_mov_b32_e32 v6, s26 ; GFX9-NEXT: v_mov_b32_e32 v5, s25 ; GFX9-NEXT: v_mov_b32_e32 v4, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:32 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 ; GFX9-NEXT: v_mov_b32_e32 v5, s17 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_a16i32_inreg__noimplicit: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v4, s24 -; GFX11-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 -; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20 -; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18 -; GFX11-NEXT: v_dual_mov_b32 v13, s17 :: v_dual_mov_b32 v12, s16 -; GFX11-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s27 :: v_dual_mov_b32 v4, s26 +; GFX11-NEXT: v_dual_mov_b32 v3, s25 :: v_dual_mov_b32 v2, s24 +; GFX11-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-NEXT: v_dual_mov_b32 v7, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-NEXT: v_dual_mov_b32 v13, s19 :: v_dual_mov_b32 v12, s18 +; GFX11-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16 ; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2 ; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll index d3a6b4e01ebfb8..eda1e33cc4b9eb 100644 --- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll @@ -336,14 +336,14 @@ define amdgpu_kernel void @indirect_2_level_use_stack() #0 { ; GCN-LABEL: {{^}}multi_call_use_use_stack: ; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr) ; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr) -; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(42, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) +; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(44, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) ; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc) ; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch) ; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack) ; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion) ; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call) -; GCN: TotalNumSgprs: 48 +; GCN: TotalNumSgprs: 50 ; GCN: NumVgprs: 41 ; GCN: ScratchSize: 2052 define amdgpu_kernel void @multi_call_use_use_stack() #0 { @@ -357,7 +357,7 @@ declare void @external() #0 ; GCN-LABEL: {{^}}multi_call_with_external: ; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr) ; GCN: .set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr) -; GCN: .set multi_call_with_external.numbered_sgpr, max(42, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external.numbered_sgpr, max(44, amdgpu.max_num_sgpr) ; GCN: .set multi_call_with_external.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_with_external.uses_vcc, 1 ; GCN: .set multi_call_with_external.uses_flat_scratch, 1 @@ -377,7 +377,7 @@ define amdgpu_kernel void @multi_call_with_external() #0 { ; GCN-LABEL: {{^}}multi_call_with_external_and_duplicates: ; GCN: .set multi_call_with_external_and_duplicates.num_vgpr, max(41, amdgpu.max_num_vgpr) ; GCN: .set multi_call_with_external_and_duplicates.num_agpr, max(0, amdgpu.max_num_agpr) -; GCN: .set multi_call_with_external_and_duplicates.numbered_sgpr, max(44, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external_and_duplicates.numbered_sgpr, max(46, amdgpu.max_num_sgpr) ; GCN: .set multi_call_with_external_and_duplicates.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) ; GCN: .set multi_call_with_external_and_duplicates.uses_vcc, 1 ; GCN: .set multi_call_with_external_and_duplicates.uses_flat_scratch, 1 diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll index 2491cc0d19d5a1..fc3915fd2e9b76 100644 --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -31,7 +31,7 @@ bb: define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 @@ -61,7 +61,7 @@ bb: define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -89,7 +89,7 @@ bb: define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] @@ -122,7 +122,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -149,7 +149,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] @@ -180,15 +180,15 @@ bb: define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: s_nor_b32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: global_store_dword v0, v1, s[6:7] ; GCN-NEXT: s_endpgm bb: %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16 @@ -205,17 +205,17 @@ bb: define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_nor_b64 s[0:1], s[0:1], s[6:7] +; GCN-NEXT: s_nor_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GCN-NEXT: s_endpgm bb: %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32 @@ -232,15 +232,15 @@ bb: define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s0, s1, s0 ; GCN-NEXT: s_nand_b32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: global_store_dword v0, v1, s[6:7] ; GCN-NEXT: s_endpgm bb: %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16 @@ -257,17 +257,17 @@ bb: define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_nand_b64 s[0:1], s[0:1], s[6:7] +; GCN-NEXT: s_nand_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GCN-NEXT: s_endpgm bb: %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32 @@ -284,15 +284,15 @@ bb: define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_xor_b32 s0, s1, s0 ; GCN-NEXT: s_xnor_b32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: global_store_dword v0, v1, s[6:7] ; GCN-NEXT: s_endpgm bb: %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16 @@ -309,17 +309,17 @@ bb: define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_xnor_b64 s[0:1], s[0:1], s[6:7] +; GCN-NEXT: s_xnor_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GCN-NEXT: s_endpgm bb: %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll index 1feae4dae6a09e..85967f57178309 100644 --- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @gds_global_align_plus_attr(ptr addrspace(1) %out) #0 define amdgpu_kernel void @gds_extern_align(ptr addrspace(1) %out, ptr addrspace(2) %gds.arg) #0 { ; GCN-LABEL: gds_extern_align: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x8 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x8 ; GCN-NEXT: v_mov_b32_e32 v0, 5 ; GCN-NEXT: s_movk_i32 m0, 0x401 ; GCN-NEXT: s_movk_i32 s1, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index c8bbafbfa44d85..7bb70fff6d1bc0 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -4,16 +4,16 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: IllegalGEPConst: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CHECK-NEXT: s_add_u32 s0, s4, s0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 -; CHECK-NEXT: s_addc_u32 s1, s5, s1 +; CHECK-NEXT: s_ashr_i32 s7, s6, 31 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll index 0f951e89d37c8a..6007dede902209 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -86,12 +86,12 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { ; GCN: global_load_u8 v{{[0-9]+}}, ; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15 -; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s2 +; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s4 ; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off ; GCN: .amdhsa_kernel queue_ptr ; WORKAROUND: .amdhsa_user_sgpr_count 15 -; NOWORKAROUND: .amdhsa_user_sgpr_count 2 +; NOWORKAROUND: .amdhsa_user_sgpr_count 4 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -106,7 +106,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 -; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 4 define amdgpu_kernel void @queue_ptr() #1 { %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 %load = load volatile i8, ptr addrspace(4) %queue.ptr @@ -120,16 +120,16 @@ define amdgpu_kernel void @queue_ptr() #1 { ; WORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14 ; WORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15 -; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s6 -; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s7 -; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s8 +; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s8 +; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9 +; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10 ; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1] ; GCN: global_load_u8 v{{[0-9]+}}, -; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[2:3] +; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5] -; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s4 -; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s5 +; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6 +; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7 ; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_X]], off ; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Y]], off @@ -138,7 +138,7 @@ define amdgpu_kernel void @queue_ptr() #1 { ; GCN: .amdhsa_kernel all_inputs ; WORKAROUND: .amdhsa_user_sgpr_count 13 -; NOWORKAROUND: .amdhsa_user_sgpr_count 6 +; NOWORKAROUND: .amdhsa_user_sgpr_count 8 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -153,7 +153,7 @@ define amdgpu_kernel void @queue_ptr() #1 { ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13 -; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 8 define amdgpu_kernel void @all_inputs() #2 { %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll index 56b25bacd2defc..8f77a348422a2a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll @@ -9,16 +9,16 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s8, s[2:3], 0x0 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s8 @@ -26,7 +26,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NEXT: v_add_f32_e32 v4, v5, v2 -; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[2:3] glc +; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[4:5] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v5 @@ -36,7 +36,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: ; %bb.3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -56,10 +56,10 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index dcd366e779449e..ec4ea232e661cf 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -23043,7 +23043,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB92_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -23062,10 +23062,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB92_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -23085,7 +23085,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -23098,23 +23098,23 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; ; GFX10-LABEL: infer_as_before_atomic: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s5, exec_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB92_3 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 @@ -23122,8 +23122,8 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execnz .LBB92_2 ; GFX10-NEXT: .LBB92_3: ; GFX10-NEXT: s_endpgm @@ -23134,10 +23134,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB92_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -23154,10 +23154,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX908-NEXT: s_cbranch_execz .LBB92_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 @@ -23174,10 +23174,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB92_3 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5 @@ -23204,25 +23204,25 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; ; GFX7-LABEL: infer_as_before_atomic: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b64 s[4:5], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7-NEXT: s_cbranch_execz .LBB92_3 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 @@ -23240,25 +23240,25 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; ; GFX6-LABEL: infer_as_before_atomic: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX6-NEXT: s_cbranch_execz .LBB92_3 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX6-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_add_f32_e32 v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll index 7fe068a445bf96..bd9aa0f5a454a3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -15,10 +15,10 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index 31069e567d86d4..845974340fd284 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half8: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v4, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -18,7 +18,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half8: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -28,7 +28,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -74,7 +74,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half6: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -84,7 +84,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half6: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -94,7 +94,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half6: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -132,7 +132,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half4: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -144,7 +144,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half4: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half4: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half2: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dword v1, v0, s[0:1] @@ -198,7 +198,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half2: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] @@ -208,7 +208,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index e54cd64798a682..b9592a9ff9073a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v0, v1, s[0:1] glc @@ -54,7 +54,7 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 16c09cf6000800..3e15b135eeab98 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -6,12 +6,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -19,12 +19,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_add_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -32,11 +32,11 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_add_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -50,14 +50,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -65,8 +65,8 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_add_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 ; VI-NEXT: s_addc_u32 s1, s1, -1 @@ -80,11 +80,11 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_add_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -98,39 +98,39 @@ entry: define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0x8ca0 +; SI-NEXT: s_mov_b32 s4, 0x8ca0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_atomic_add v0, off, s[0:3], s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_atomic_add v0, off, s[0:3], s4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s5, 0x8ca0 +; VI-NEXT: s_mov_b32 s4, 0x8ca0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_atomic_add v0, off, s[0:3], s5 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: buffer_atomic_add v0, off, s[0:3], s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -144,8 +144,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac @@ -159,8 +159,8 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_add_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 0xdeac ; VI-NEXT: s_addc_u32 s1, s1, 0xabcd @@ -174,13 +174,13 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_add_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac ; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -195,51 +195,51 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_add v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -251,14 +251,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -268,18 +268,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -287,14 +287,14 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -310,65 +310,65 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -381,12 +381,12 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -394,12 +394,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_add_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -407,11 +407,11 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_add_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -424,51 +424,51 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_add v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -479,14 +479,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 @@ -496,16 +496,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_add_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -513,14 +513,14 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_add_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -535,63 +535,63 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -603,12 +603,12 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -616,12 +616,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_and_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -629,11 +629,11 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_and_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -647,51 +647,51 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_and v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -703,14 +703,14 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -720,18 +720,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -739,14 +739,14 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -762,65 +762,65 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -833,12 +833,12 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -846,12 +846,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_and_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -859,11 +859,11 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_and_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -876,51 +876,51 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_and v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -931,14 +931,14 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64 @@ -948,16 +948,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_and_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -965,14 +965,14 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_and_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -987,63 +987,63 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -1055,12 +1055,12 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -1068,12 +1068,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_sub_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1081,11 +1081,11 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_sub_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1099,51 +1099,51 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -1155,14 +1155,14 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -1172,18 +1172,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1191,14 +1191,14 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_sub_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1214,65 +1214,65 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -1285,12 +1285,12 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -1298,12 +1298,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_sub_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1311,11 +1311,11 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_sub_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1328,51 +1328,51 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -1383,14 +1383,14 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64 @@ -1400,16 +1400,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_sub_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1417,14 +1417,14 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_sub_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1439,63 +1439,63 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -1507,12 +1507,12 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -1520,12 +1520,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_max_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1533,11 +1533,11 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_max_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1551,48 +1551,48 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -1604,14 +1604,14 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -1619,31 +1619,31 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm @@ -1657,62 +1657,62 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -1725,33 +1725,33 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -1762,48 +1762,48 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 glc -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 glc +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -1814,14 +1814,14 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64 @@ -1829,29 +1829,29 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -1864,60 +1864,60 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -1929,33 +1929,33 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -1967,48 +1967,48 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umax_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2020,14 +2020,14 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -2035,31 +2035,31 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm @@ -2073,62 +2073,62 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -2141,33 +2141,33 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -2178,48 +2178,48 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umax_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 glc -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 glc +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2230,14 +2230,14 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64 @@ -2245,29 +2245,29 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -2280,60 +2280,60 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -2345,33 +2345,33 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -2383,48 +2383,48 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2436,14 +2436,14 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -2451,31 +2451,31 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm @@ -2489,62 +2489,62 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -2557,33 +2557,33 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -2594,48 +2594,48 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 glc -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 glc +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2646,14 +2646,14 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64 @@ -2661,29 +2661,29 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_min_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -2696,60 +2696,60 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -2761,33 +2761,33 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -2799,48 +2799,48 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2852,14 +2852,14 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -2867,31 +2867,31 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm @@ -2905,62 +2905,62 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -2973,33 +2973,33 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -3010,48 +3010,48 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 glc -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 glc +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3062,14 +3062,14 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64 @@ -3077,29 +3077,29 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -3112,60 +3112,60 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 glc -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -3177,12 +3177,12 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -3190,12 +3190,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_or_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3203,11 +3203,11 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_or_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3221,51 +3221,51 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_or v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -3277,14 +3277,14 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -3294,18 +3294,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; VI-LABEL: atomic_or_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3313,14 +3313,14 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; GFX9-LABEL: atomic_or_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3336,65 +3336,65 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -3407,12 +3407,12 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -3420,12 +3420,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_or_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3433,11 +3433,11 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_or_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3450,51 +3450,51 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_or v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -3505,14 +3505,14 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64 @@ -3522,16 +3522,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_or_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3539,14 +3539,14 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_or_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3561,63 +3561,63 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -3629,12 +3629,12 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -3642,12 +3642,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3655,11 +3655,11 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3673,12 +3673,12 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) { ; SI-LABEL: atomic_xchg_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -3686,12 +3686,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; VI-LABEL: atomic_xchg_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3699,11 +3699,11 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; GFX9-LABEL: atomic_xchg_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3717,51 +3717,51 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -3773,14 +3773,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -3790,18 +3790,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3809,14 +3809,14 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3832,65 +3832,65 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -3903,12 +3903,12 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -3916,12 +3916,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3929,11 +3929,11 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3946,51 +3946,51 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -4001,14 +4001,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64 @@ -4018,16 +4018,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4035,14 +4035,14 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4057,63 +4057,63 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -4125,7 +4125,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,7 +4140,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4155,12 +4155,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4173,54 +4173,54 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v2, v0, s[6:7] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -4233,16 +4233,16 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s9, s[4:5], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16 @@ -4252,19 +4252,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4273,17 +4273,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4298,71 +4298,71 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s10, s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s9, s[4:5], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v2, v0, s[6:7] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -4376,7 +4376,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4391,7 +4391,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; VI-LABEL: atomic_cmpxchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4406,12 +4406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4423,54 +4423,54 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v2, v0, s[6:7] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4482,16 +4482,16 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s9, s[4:5], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 @@ -4501,17 +4501,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4520,17 +4520,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4544,69 +4544,69 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s10, s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s9, s[4:5], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 +; GFX9-NEXT: s_load_dword s9, s[4:5], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v2, v0, s[6:7] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -4619,12 +4619,12 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -4632,12 +4632,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_xor_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4645,11 +4645,11 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_xor_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4663,51 +4663,51 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -4719,14 +4719,14 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -4736,18 +4736,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4755,14 +4755,14 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4778,65 +4778,65 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -4849,12 +4849,12 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -4862,12 +4862,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xor_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4875,11 +4875,11 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xor_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4892,51 +4892,51 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -4947,14 +4947,14 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64 @@ -4964,16 +4964,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_xor_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4981,14 +4981,14 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_xor_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5003,63 +5003,63 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -5071,7 +5071,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5087,7 +5087,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5105,13 +5105,13 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %in, i64 4 @@ -5123,7 +5123,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5141,7 +5141,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i32_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5159,13 +5159,13 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i32_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:-512 glc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %in, i64 -128 @@ -5177,7 +5177,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,7 +5193,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5211,13 +5211,13 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(1) %in, i64 4 @@ -5229,7 +5229,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5245,7 +5245,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5261,13 +5261,13 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -5278,32 +5278,30 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -5313,6 +5311,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -5320,17 +5320,17 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index @@ -5343,32 +5343,30 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -5376,6 +5374,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -5383,17 +5383,17 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index @@ -5405,32 +5405,30 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -5440,6 +5438,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -5447,17 +5447,17 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr addrspace(1) %in, i64 %index @@ -5470,19 +5470,19 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 @@ -5494,11 +5494,11 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5510,19 +5510,19 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -5532,11 +5532,11 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5547,19 +5547,19 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -5569,11 +5569,11 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_f32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5584,44 +5584,44 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dword s4, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5634,44 +5634,44 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dword s4, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5684,43 +5684,43 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dword s8, s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b64 s[0:1], s[4:5] +; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dword s4, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5732,43 +5732,43 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dword s8, s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b64 s[0:1], s[4:5] +; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_f32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dword s4, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5780,7 +5780,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5796,7 +5796,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; VI-LABEL: atomic_load_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5812,13 +5812,13 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; GFX9-LABEL: atomic_load_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:16 glc +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %in, i64 16 @@ -5830,7 +5830,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5848,7 +5848,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; VI-LABEL: atomic_load_i8_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5866,13 +5866,13 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; GFX9-LABEL: atomic_load_i8_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:-512 glc +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %in, i64 -512 @@ -5884,19 +5884,19 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 @@ -5908,11 +5908,11 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; ; GFX9-LABEL: atomic_store_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5924,19 +5924,19 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -5946,11 +5946,11 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5961,7 +5961,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5977,7 +5977,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5993,13 +5993,13 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %in, i64 8 @@ -6011,7 +6011,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -6029,7 +6029,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i16_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6047,13 +6047,13 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i16_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %in, i64 -256 @@ -6065,19 +6065,19 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 @@ -6089,11 +6089,11 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -6105,19 +6105,19 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -6127,11 +6127,11 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -6142,19 +6142,19 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_f16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 @@ -6166,11 +6166,11 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_store_f16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -6182,19 +6182,19 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -6204,11 +6204,11 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -6219,19 +6219,19 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 @@ -6243,11 +6243,11 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; ; GFX9-LABEL: atomic_store_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 @@ -6258,19 +6258,19 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -6280,11 +6280,11 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; ; GFX9-LABEL: atomic_store_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6294,12 +6294,12 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -6307,12 +6307,12 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_inc_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6320,11 +6320,11 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_inc_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6338,14 +6338,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -6353,8 +6353,8 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 ; VI-NEXT: s_addc_u32 s1, s1, -1 @@ -6368,11 +6368,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6386,39 +6386,39 @@ entry: define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0x8ca0 +; SI-NEXT: s_mov_b32 s4, 0x8ca0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_atomic_inc v0, off, s[0:3], s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_atomic_inc v0, off, s[0:3], s4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s5, 0x8ca0 +; VI-NEXT: s_mov_b32 s4, 0x8ca0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], s5 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6432,8 +6432,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac @@ -6447,8 +6447,8 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_inc_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 0xdeac ; VI-NEXT: s_addc_u32 s1, s1, 0xabcd @@ -6462,13 +6462,13 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_inc_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac ; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6482,51 +6482,51 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_inc_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -6538,14 +6538,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -6555,18 +6555,18 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6574,14 +6574,14 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6597,65 +6597,65 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -6668,12 +6668,12 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -6681,12 +6681,12 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_dec_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6694,11 +6694,11 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_dec_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6712,14 +6712,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 ; SI-NEXT: v_mov_b32_e32 v1, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 @@ -6727,8 +6727,8 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 ; VI-NEXT: s_addc_u32 s1, s1, -1 @@ -6742,11 +6742,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_dec_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6760,39 +6760,39 @@ entry: define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0x8ca0 +; SI-NEXT: s_mov_b32 s4, 0x8ca0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_atomic_dec v0, off, s[0:3], s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_atomic_dec v0, off, s[0:3], s4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s5, 0x8ca0 +; VI-NEXT: s_mov_b32 s4, 0x8ca0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], s5 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6806,8 +6806,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac @@ -6821,8 +6821,8 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_dec_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 0xdeac ; VI-NEXT: s_addc_u32 s1, s1, 0xabcd @@ -6836,13 +6836,13 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_dec_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac ; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6856,51 +6856,51 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_dec_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -6912,14 +6912,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 offset:16 @@ -6929,18 +6929,18 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6948,14 +6948,14 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6971,65 +6971,65 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index @@ -7042,7 +7042,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,13 +7074,13 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %gep = getelementptr half, ptr addrspace(1) %in, i64 8 %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2 @@ -7091,7 +7091,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_f16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7127,13 +7127,13 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_f16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %gep = getelementptr half, ptr addrspace(1) %in, i64 -256 %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2 @@ -7144,7 +7144,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,7 +7160,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; VI-LABEL: atomic_load_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7176,13 +7176,13 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; GFX9-LABEL: atomic_load_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8 %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2 @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_bf16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7229,13 +7229,13 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_bf16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256 %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index c7fa2a2ede3887..f7882e6f120222 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4616,7 +4616,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4648,7 +4648,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4714,8 +4714,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4753,26 +4753,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x10 -; VI-NEXT: s_add_u32 s6, s6, 16 -; VI-NEXT: s_addc_u32 s7, s7, 0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_ashr_i32 s5, s7, 31 +; VI-NEXT: s_mov_b32 s4, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_i32_e32 v2, s4, v3 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4789,24 +4789,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_mov_b32 s4, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_i32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -4816,7 +4816,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index @@ -4829,7 +4829,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4861,7 +4861,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4924,8 +4924,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4963,24 +4963,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_ashr_i32 s5, s7, 31 +; VI-NEXT: s_mov_b32 s4, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_i32_e32 v2, s4, v3 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4997,24 +4997,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_mov_b32 s4, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_i32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -5024,7 +5024,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index @@ -5869,7 +5869,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 +; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5967,8 +5967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6006,26 +6006,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x10 -; VI-NEXT: s_add_u32 s6, s6, 16 -; VI-NEXT: s_addc_u32 s7, s7, 0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_ashr_i32 s5, s7, 31 +; VI-NEXT: s_mov_b32 s4, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_u32_e32 v2, s4, v3 +; VI-NEXT: v_max_u32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6042,24 +6042,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_mov_b32 s4, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -6069,7 +6069,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index @@ -6082,8 +6082,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6121,24 +6121,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_ashr_i32 s5, s7, 31 +; VI-NEXT: s_mov_b32 s4, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_u32_e32 v2, s4, v3 +; VI-NEXT: v_max_u32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6155,24 +6155,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_mov_b32 s4, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -6182,7 +6182,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index @@ -7860,7 +7860,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -7892,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s7, 31 -; GFX9-NEXT: s_mov_b32 s0, s7 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_mov_b32 s4, s3 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 +; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -7958,8 +7958,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -7997,26 +7997,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x10 -; VI-NEXT: s_add_u32 s6, s6, 16 -; VI-NEXT: s_addc_u32 s7, s7, 0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_ashr_i32 s5, s7, 31 +; VI-NEXT: s_mov_b32 s4, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x10 +; VI-NEXT: s_add_u32 s4, s4, 16 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s4, v3 +; VI-NEXT: v_min_i32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8033,24 +8033,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_mov_b32 s4, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_min_i32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -8060,7 +8060,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index @@ -8073,8 +8073,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[4:5], 0 @@ -8101,13 +8101,13 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_load_dword s3, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: .LBB130_1: ; %atomicrmw.start @@ -8126,17 +8126,17 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8155,8 +8155,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -8194,24 +8194,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_ashr_i32 s5, s7, 31 +; VI-NEXT: s_mov_b32 s4, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; VI-NEXT: s_add_u32 s4, s0, s4 +; VI-NEXT: s_addc_u32 s5, s1, s5 +; VI-NEXT: s_load_dword s7, s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: .LBB131_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s4, v3 +; VI-NEXT: v_min_i32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8228,24 +8228,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_mov_b32 s4, s7 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_min_i32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -8255,7 +8255,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 24c08ec86051f8..b1c68a06a818f9 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -20,7 +20,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_add_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -33,19 +33,19 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_add_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -62,64 +62,64 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -135,34 +135,34 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -171,15 +171,15 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -188,14 +188,14 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -210,7 +210,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -231,7 +231,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -254,23 +254,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -293,7 +293,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -308,7 +308,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_add_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -323,19 +323,19 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_add_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -351,64 +351,64 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc +; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -423,32 +423,32 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -457,15 +457,15 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_add_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -474,14 +474,14 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -495,7 +495,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -516,7 +516,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -537,23 +537,23 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -575,7 +575,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -588,7 +588,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_and_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -601,19 +601,19 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_and_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -630,64 +630,64 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -703,34 +703,34 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -739,15 +739,15 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -756,14 +756,14 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -778,7 +778,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -799,7 +799,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -822,23 +822,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -861,7 +861,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -876,7 +876,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_and_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -891,19 +891,19 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_and_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -919,64 +919,64 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc +; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -991,32 +991,32 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_and_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1025,15 +1025,15 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_and_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1042,14 +1042,14 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1063,7 +1063,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1084,7 +1084,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1105,23 +1105,23 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1143,7 +1143,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1156,7 +1156,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_sub_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1169,19 +1169,19 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_sub_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1198,64 +1198,64 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1271,34 +1271,34 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1307,15 +1307,15 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_sub_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1324,14 +1324,14 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1346,7 +1346,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1367,7 +1367,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1390,23 +1390,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1429,7 +1429,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1444,7 +1444,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_sub_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1459,19 +1459,19 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_sub_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1487,64 +1487,64 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc +; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1559,32 +1559,32 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_sub_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1593,15 +1593,15 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_sub_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1610,14 +1610,14 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1631,7 +1631,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1673,23 +1673,23 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1711,7 +1711,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_max_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1722,7 +1722,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_max_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1733,17 +1733,17 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_max_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1760,61 +1760,61 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -1830,61 +1830,61 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -1899,7 +1899,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,7 +1919,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1941,22 +1941,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1979,7 +1979,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_max_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1992,7 +1992,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_max_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,17 +2005,17 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_max_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2031,61 +2031,61 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 glc +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2100,59 +2100,59 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2166,7 +2166,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2186,7 +2186,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2206,22 +2206,22 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2243,7 +2243,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umax_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umax_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2265,17 +2265,17 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2292,61 +2292,61 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2362,61 +2362,61 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2431,7 +2431,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2451,7 +2451,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2473,22 +2473,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2511,7 +2511,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umax_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2524,7 +2524,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umax_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2537,17 +2537,17 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umax_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2563,61 +2563,61 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 glc +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2632,59 +2632,59 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umax_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2698,7 +2698,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2718,7 +2718,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2738,22 +2738,22 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2775,7 +2775,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2786,7 +2786,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_min_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2797,17 +2797,17 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_min_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2824,61 +2824,61 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2894,61 +2894,61 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2963,7 +2963,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2983,7 +2983,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3005,22 +3005,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3043,7 +3043,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3056,7 +3056,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3069,17 +3069,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3095,61 +3095,61 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 glc +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3164,59 +3164,59 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3230,7 +3230,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3250,7 +3250,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3270,22 +3270,22 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3307,7 +3307,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3318,7 +3318,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umin_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3329,17 +3329,17 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3356,61 +3356,61 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3426,61 +3426,61 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3495,7 +3495,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3515,7 +3515,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3537,22 +3537,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3575,7 +3575,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3588,7 +3588,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umin_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3601,17 +3601,17 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umin_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3627,61 +3627,61 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 glc +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 glc +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3696,59 +3696,59 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_umin_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3762,7 +3762,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3782,7 +3782,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3802,22 +3802,22 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3839,7 +3839,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3852,7 +3852,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_or_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3865,19 +3865,19 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_or_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3894,64 +3894,64 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_or_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3967,34 +3967,34 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4003,15 +4003,15 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; ; GFX9-LABEL: atomic_or_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4020,14 +4020,14 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4042,7 +4042,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4063,7 +4063,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4086,23 +4086,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4125,7 +4125,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,7 +4140,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_or_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4155,19 +4155,19 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_or_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4183,64 +4183,64 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_or_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc +; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4255,32 +4255,32 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_or_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4289,15 +4289,15 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_or_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4306,14 +4306,14 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4327,7 +4327,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4348,7 +4348,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4369,23 +4369,23 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4407,7 +4407,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4420,7 +4420,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_xchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4433,19 +4433,19 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4462,7 +4462,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double %in) { ; CI-LABEL: atomic_xchg_f64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4475,7 +4475,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; VI-LABEL: atomic_xchg_f64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4488,19 +4488,19 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX9-LABEL: atomic_xchg_f64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4517,7 +4517,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr %in) { ; CI-LABEL: atomic_xchg_pointer_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4530,7 +4530,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_pointer_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4543,19 +4543,19 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_pointer_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4572,64 +4572,64 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4645,34 +4645,34 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4681,15 +4681,15 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4698,14 +4698,14 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4720,7 +4720,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4741,7 +4741,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4764,23 +4764,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4803,7 +4803,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4818,7 +4818,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4833,19 +4833,19 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4861,64 +4861,64 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc +; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4933,32 +4933,32 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4967,15 +4967,15 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4984,14 +4984,14 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5005,7 +5005,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5026,7 +5026,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5047,23 +5047,23 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5085,7 +5085,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xor_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_xor_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5111,19 +5111,19 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_xor_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5140,64 +5140,64 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5213,34 +5213,34 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5249,15 +5249,15 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5266,14 +5266,14 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5288,7 +5288,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5309,7 +5309,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5332,23 +5332,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5371,7 +5371,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xor_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5386,7 +5386,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xor_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5401,19 +5401,19 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xor_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5429,64 +5429,64 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc +; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc +; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5501,32 +5501,32 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xor_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5535,15 +5535,15 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_xor_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5552,14 +5552,14 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5573,7 +5573,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5594,7 +5594,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5615,23 +5615,23 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5653,51 +5653,51 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -5705,13 +5705,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5724,53 +5724,53 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_soffset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, 0x11940 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_mov_b32 s0, 0x11940 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], s0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, 0x11940 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, 0x11940 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], s0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x11000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:2368 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:2368 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -5778,13 +5778,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:72000 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -5797,7 +5797,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5817,7 +5817,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5837,22 +5837,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[8:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -5873,7 +5873,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5892,7 +5892,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 @@ -5912,16 +5912,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[12:13], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5929,7 +5929,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -5951,42 +5951,42 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; CI-NEXT: s_mov_b32 s15, 0xf000 -; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_mov_b32 s12, s6 -; CI-NEXT: s_mov_b32 s13, s7 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s15 -; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc +; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: s_mov_b32 s0, s10 +; CI-NEXT: s_mov_b32 s1, s11 +; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: v_mov_b32_e32 v0, s12 +; CI-NEXT: v_mov_b32_e32 v1, s13 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; VI-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s4, s2 -; VI-NEXT: s_addc_u32 s3, s5, s3 +; VI-NEXT: s_add_u32 s0, s8, s2 +; VI-NEXT: s_addc_u32 s3, s9, s3 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -5994,46 +5994,46 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s0, s10 +; VI-NEXT: s_mov_b32 s1, s11 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s2, s8, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s3, s9, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[8:9], s[0:1] ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[10:11] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6047,51 +6047,51 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6099,13 +6099,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm @@ -6117,7 +6117,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6137,7 +6137,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6157,22 +6157,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[8:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -6192,7 +6192,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6211,7 +6211,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; VI-LABEL: atomic_cmpxchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 @@ -6229,16 +6229,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[12:13], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6246,7 +6246,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -6267,39 +6267,39 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; CI-NEXT: s_mov_b32 s15, 0xf000 -; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_mov_b32 s12, s6 -; CI-NEXT: s_mov_b32 s13, s7 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s15 -; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: s_mov_b32 s0, s10 +; CI-NEXT: s_mov_b32 s1, s11 +; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: v_mov_b32_e32 v0, s12 +; CI-NEXT: v_mov_b32_e32 v1, s13 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; VI-NEXT: s_add_u32 s2, s4, s2 -; VI-NEXT: s_addc_u32 s3, s5, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; VI-NEXT: s_add_u32 s2, s8, s2 +; VI-NEXT: s_addc_u32 s3, s9, s3 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v4, s2 @@ -6308,46 +6308,46 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s0, s10 +; VI-NEXT: s_mov_b32 s1, s11 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s2, s8, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s3, s9, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s13 +; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[8:9], s[0:1] ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7] +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[10:11] ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6360,7 +6360,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6376,7 +6376,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6394,18 +6394,18 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS @@ -6423,7 +6423,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_neg_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_not_b32_e32 v0, 31 ; CI-NEXT: v_mov_b32_e32 v1, -1 @@ -6441,7 +6441,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_i64_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6459,18 +6459,18 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_i64_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:-32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64_neg_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 scope:SCOPE_SYS @@ -6488,7 +6488,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6504,7 +6504,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6520,18 +6520,18 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV @@ -6548,32 +6548,30 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s3, s7 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32 glc +; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -6583,6 +6581,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -6590,27 +6590,27 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS @@ -6629,32 +6629,30 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 s9, s7 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 glc +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -6662,6 +6660,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -6669,27 +6669,27 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_SYS @@ -6707,32 +6707,30 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s3, s7 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32 glc +; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -6742,6 +6740,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -6749,27 +6749,27 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS @@ -6788,7 +6788,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6801,7 +6801,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 32 @@ -6814,17 +6814,17 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -6840,7 +6840,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6853,7 +6853,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -6864,17 +6864,17 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -6889,62 +6889,62 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: s_lshl_b64 s[4:5], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s6, s0 -; VI-NEXT: s_addc_u32 s1, s7, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm @@ -6958,60 +6958,60 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[8:11], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s6, s0 -; VI-NEXT: s_addc_u32 s1, s7, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm @@ -7024,62 +7024,62 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: s_lshl_b64 s[4:5], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s6, s0 -; VI-NEXT: s_addc_u32 s1, s7, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm @@ -7093,7 +7093,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_inc_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7106,7 +7106,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_inc_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7119,19 +7119,19 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_inc_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7148,64 +7148,64 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_inc_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -7221,34 +7221,34 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_inc_i64_incr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_inc_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i64_incr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7257,15 +7257,15 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i64_incr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7274,14 +7274,14 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -7296,7 +7296,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_dec_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7309,7 +7309,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_dec_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7322,19 +7322,19 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_dec_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7351,64 +7351,64 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_dec_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -7424,34 +7424,34 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_dec_i64_decr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_atomic_dec_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i64_decr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; VI-NEXT: s_add_u32 s0, s4, s0 -; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -7460,15 +7460,15 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i64_decr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7477,14 +7477,14 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 9af7e0978f9dbe..59a99a6a0328d4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -4866,10 +4866,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 ; SI-NEXT: s_addc_u32 s5, s1, s5 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 @@ -4905,8 +4905,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -4941,23 +4941,23 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc @@ -4965,9 +4965,9 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4981,7 +4981,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5025,7 +5025,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5064,16 +5064,16 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -5081,7 +5081,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc @@ -5094,7 +5094,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -5107,10 +5107,10 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 ; SI-NEXT: s_addc_u32 s5, s1, s5 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 @@ -5146,10 +5146,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -5180,23 +5180,23 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -5204,9 +5204,9 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5219,7 +5219,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -5300,16 +5300,16 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -5317,7 +5317,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc @@ -5330,7 +5330,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6328,10 +6328,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 ; SI-NEXT: s_addc_u32 s5, s1, s5 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 @@ -6367,8 +6367,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6403,23 +6403,23 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc @@ -6427,9 +6427,9 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -6443,7 +6443,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6487,7 +6487,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6526,16 +6526,16 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -6543,7 +6543,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc @@ -6556,7 +6556,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -6569,7 +6569,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6613,7 +6613,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -6650,16 +6650,16 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -6667,7 +6667,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc @@ -6680,7 +6680,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -8664,10 +8664,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; SI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 ; SI-NEXT: s_addc_u32 s5, s1, s5 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 @@ -8703,8 +8703,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8739,23 +8739,23 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc @@ -8763,9 +8763,9 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8779,7 +8779,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -8823,7 +8823,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8862,16 +8862,16 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -8879,7 +8879,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc @@ -8892,7 +8892,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index @@ -8905,7 +8905,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: atomic_min_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[8:9], 0 @@ -8942,7 +8942,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 @@ -8972,29 +8972,29 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -9006,7 +9006,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -9050,7 +9050,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -9087,16 +9087,16 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 +; GFX9-NEXT: s_add_u32 s0, s8, s0 +; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -9104,7 +9104,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8] +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc @@ -9117,7 +9117,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 7cc78b47a72214..30ae461d5de5ad 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -18,15 +18,15 @@ declare double @div.double.value() define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -141,8 +141,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm @@ -150,14 +150,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -177,7 +177,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -189,15 +189,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -225,23 +225,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -257,23 +257,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -289,22 +289,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -312,8 +312,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -321,14 +321,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -348,7 +348,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -369,24 +369,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -440,24 +441,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -506,24 +508,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -572,24 +575,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -633,22 +637,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop @@ -681,22 +686,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop @@ -733,26 +739,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -782,24 +789,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -863,24 +871,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -939,24 +948,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,22 +1015,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -1067,22 +1078,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -1131,17 +1143,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1179,11 +1191,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1191,7 +1203,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1221,18 +1233,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1261,9 +1273,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 @@ -1272,12 +1284,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1286,8 +1298,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1310,7 +1322,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -1336,7 +1348,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 @@ -1351,17 +1363,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1399,11 +1411,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1411,7 +1423,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1441,18 +1453,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1481,9 +1493,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 @@ -1492,12 +1504,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1506,8 +1518,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1530,7 +1542,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -1556,7 +1568,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 @@ -1577,24 +1589,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1648,24 +1661,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1714,24 +1728,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1780,24 +1795,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1841,22 +1857,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop @@ -1889,22 +1906,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop @@ -1941,26 +1959,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1990,24 +2009,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2071,24 +2091,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2147,24 +2168,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2213,22 +2235,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -2275,22 +2298,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -2339,17 +2363,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2387,11 +2411,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2399,7 +2423,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2429,18 +2453,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2469,9 +2493,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 @@ -2480,12 +2504,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2494,8 +2518,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2518,7 +2542,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2549,7 +2573,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -2560,14 +2584,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2576,9 +2600,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2589,17 +2613,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2637,11 +2661,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2649,7 +2673,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2679,18 +2703,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2719,9 +2743,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 @@ -2730,12 +2754,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2744,8 +2768,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2768,7 +2792,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2799,7 +2823,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -2810,14 +2834,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2826,9 +2850,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2845,24 +2869,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2916,24 +2941,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2982,24 +3008,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3048,24 +3075,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3109,22 +3137,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop @@ -3157,22 +3186,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop @@ -3209,26 +3239,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3258,24 +3289,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3339,24 +3371,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3415,24 +3448,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3481,22 +3515,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -3543,22 +3578,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -3609,24 +3645,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3680,24 +3717,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3746,24 +3784,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3812,24 +3851,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3873,22 +3913,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop @@ -3921,22 +3962,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop @@ -3973,26 +4015,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4022,24 +4065,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4103,24 +4147,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4179,24 +4224,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4245,22 +4291,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -4307,22 +4354,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -4371,17 +4419,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4419,11 +4467,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4431,7 +4479,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4461,18 +4509,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4501,9 +4549,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 @@ -4512,12 +4560,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4526,8 +4574,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm @@ -4550,7 +4598,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4581,7 +4629,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -4592,14 +4640,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4608,9 +4656,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm @@ -4621,17 +4669,17 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4669,11 +4717,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -4681,7 +4729,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4711,18 +4759,18 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4751,9 +4799,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 @@ -4762,12 +4810,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4776,8 +4824,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -4800,7 +4848,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4831,7 +4879,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -4842,14 +4890,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4858,9 +4906,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -4876,24 +4924,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -4947,24 +4996,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5013,24 +5063,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -5079,24 +5130,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -5140,22 +5192,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop @@ -5201,22 +5254,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop @@ -5266,26 +5320,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5315,24 +5370,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5396,24 +5452,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5472,24 +5529,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5538,22 +5596,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -5613,22 +5672,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -5691,10 +5751,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -5702,16 +5762,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 @@ -5733,7 +5794,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5741,25 +5802,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm @@ -5769,34 +5831,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s10 ; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -5808,35 +5871,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm @@ -5847,28 +5911,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -5887,34 +5952,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm @@ -5922,30 +5988,31 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 +; GFX1032-NEXT: s_mov_b32 s33, s10 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -5964,62 +6031,64 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6037,23 +6106,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6061,8 +6130,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6070,26 +6139,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6106,28 +6176,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6140,10 +6210,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6151,16 +6221,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6182,7 +6253,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -6190,25 +6261,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -6218,34 +6290,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6257,35 +6330,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm @@ -6296,28 +6370,29 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6336,34 +6411,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -6371,30 +6447,31 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6413,62 +6490,64 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6486,23 +6565,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6510,8 +6589,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6519,26 +6598,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6555,28 +6635,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6593,34 +6673,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 @@ -6648,7 +6730,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -6662,7 +6744,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -6670,10 +6752,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -6682,13 +6765,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm @@ -6699,34 +6782,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 @@ -6748,11 +6833,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB10_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6762,35 +6847,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm @@ -6801,34 +6887,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -6850,11 +6938,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB10_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -6866,34 +6954,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm @@ -6904,34 +6993,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -6947,16 +7038,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB10_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -6968,60 +7059,63 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v41, 0 @@ -7049,11 +7143,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB10_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7065,23 +7159,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7089,8 +7183,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7098,8 +7192,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7107,10 +7201,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -7118,6 +7213,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v41, 0 @@ -7137,17 +7233,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7159,28 +7255,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7193,15 +7289,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -7209,25 +7306,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7241,7 +7339,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -7249,10 +7347,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -7261,51 +7360,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -7349,55 +7450,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[44:45] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7408,34 +7510,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -7477,10 +7581,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7492,34 +7596,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -7530,34 +7635,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -7589,14 +7696,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7608,60 +7715,63 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -7715,10 +7825,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7730,23 +7840,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7754,8 +7864,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7763,8 +7873,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -7772,10 +7882,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -7783,6 +7894,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -7824,14 +7936,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7843,28 +7955,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7881,17 +7993,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -7932,11 +8044,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7944,7 +8056,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -7975,17 +8087,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8016,24 +8128,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8042,8 +8154,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm @@ -8066,7 +8178,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8098,7 +8210,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -8109,14 +8221,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8125,9 +8237,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm @@ -8138,17 +8250,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8189,11 +8301,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8201,7 +8313,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8232,17 +8344,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8273,24 +8385,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8299,8 +8411,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8323,7 +8435,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8355,7 +8467,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -8366,14 +8478,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8382,9 +8494,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -8400,24 +8512,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -8476,24 +8589,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8545,24 +8659,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -8614,24 +8729,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -8678,22 +8794,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -8742,22 +8859,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -8808,26 +8926,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8860,24 +8979,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8958,24 +9078,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9046,24 +9167,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9124,22 +9246,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9213,22 +9336,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9302,17 +9426,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -9353,11 +9477,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9365,7 +9489,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9396,17 +9520,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -9437,24 +9561,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9463,8 +9587,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -9487,7 +9611,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -9519,7 +9643,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -9530,14 +9654,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9546,9 +9670,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -9559,17 +9683,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -9610,11 +9734,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9622,7 +9746,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9653,17 +9777,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -9694,24 +9818,24 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9720,8 +9844,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -9744,7 +9868,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -9776,7 +9900,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -9787,14 +9911,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9803,9 +9927,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -9821,24 +9945,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -9897,24 +10022,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9966,24 +10092,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -10035,24 +10162,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -10099,22 +10227,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -10163,22 +10292,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -10229,26 +10359,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10281,24 +10412,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10379,24 +10511,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10467,24 +10600,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10545,22 +10679,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -10634,22 +10769,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -10724,24 +10860,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -10800,24 +10937,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10869,24 +11007,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -10938,24 +11077,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -11002,22 +11142,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -11066,22 +11207,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -11132,26 +11274,27 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11184,24 +11327,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11282,24 +11426,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,24 +11515,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11448,22 +11594,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11537,22 +11684,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11627,9 +11775,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 @@ -11642,15 +11790,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -11671,7 +11820,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -11679,25 +11828,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm @@ -11708,10 +11858,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 @@ -11724,19 +11874,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start @@ -11748,35 +11899,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm @@ -11787,9 +11939,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 @@ -11801,18 +11953,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -11827,34 +11980,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm @@ -11865,12 +12019,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -11879,17 +12033,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -11904,41 +12059,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -11959,15 +12115,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -11983,23 +12140,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12007,8 +12164,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12016,12 +12173,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12036,14 +12193,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12058,28 +12216,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12092,9 +12250,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12107,15 +12265,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12136,7 +12295,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -12144,25 +12303,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -12173,10 +12333,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12189,19 +12349,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12213,35 +12374,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm @@ -12252,9 +12414,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -12266,18 +12428,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12292,34 +12455,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -12330,12 +12494,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12344,17 +12508,18 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12369,41 +12534,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12424,15 +12590,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12448,23 +12615,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12472,8 +12639,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12481,12 +12648,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12501,14 +12668,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12523,28 +12691,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12561,34 +12729,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 @@ -12616,7 +12786,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -12630,7 +12800,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -12638,10 +12808,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -12650,13 +12821,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm @@ -12667,34 +12838,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 @@ -12716,11 +12889,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -12730,35 +12903,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm @@ -12769,34 +12943,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -12818,11 +12994,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -12834,34 +13010,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm @@ -12872,34 +13049,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -12915,16 +13094,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB17_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -12936,60 +13115,63 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v41, 0 @@ -13017,11 +13199,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13033,23 +13215,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13057,8 +13239,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13066,8 +13248,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -13075,10 +13257,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -13086,6 +13269,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v41, 0 @@ -13105,17 +13289,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB17_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13127,28 +13311,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13161,15 +13345,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -13177,25 +13362,26 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13209,7 +13395,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -13217,10 +13403,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -13229,51 +13416,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -13317,55 +13506,56 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[44:45] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm @@ -13376,34 +13566,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -13445,10 +13637,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13460,34 +13652,35 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -13498,34 +13691,36 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -13557,14 +13752,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13576,60 +13771,63 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -13683,10 +13881,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -13698,23 +13896,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13722,8 +13920,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13731,8 +13929,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -13740,10 +13938,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -13751,6 +13950,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -13792,14 +13992,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -13811,28 +14011,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13845,15 +14045,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -13881,23 +14081,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -13913,23 +14113,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -13945,22 +14145,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -13968,8 +14168,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-NEXT: .LBB18_3: ; GFX1032-NEXT: s_endpgm @@ -13977,14 +14177,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -14004,7 +14204,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -14016,15 +14216,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -14052,23 +14252,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14084,23 +14284,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14116,22 +14316,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14139,8 +14339,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-DPP-NEXT: .LBB18_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -14148,14 +14348,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -14175,7 +14375,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -14191,15 +14391,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -14227,23 +14427,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14259,23 +14459,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14291,22 +14491,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14314,8 +14514,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-NEXT: .LBB19_3: ; GFX1032-NEXT: s_endpgm @@ -14323,14 +14523,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB19_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -14350,7 +14550,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB19_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -14362,15 +14562,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -14398,23 +14598,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14430,23 +14630,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14462,22 +14662,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14485,8 +14685,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-DPP-NEXT: .LBB19_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -14494,14 +14694,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 @@ -14521,7 +14721,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 8e7181a0cf4495..cb0452d4c99b59 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -24,7 +24,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -86,7 +86,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -101,7 +101,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -118,7 +118,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -134,7 +134,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -149,7 +149,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -182,7 +182,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -211,7 +211,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -226,7 +226,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -259,7 +259,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -277,24 +277,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -352,24 +353,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -422,24 +424,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -478,24 +481,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -529,22 +533,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop @@ -579,22 +584,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop @@ -633,26 +639,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -684,24 +691,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -774,24 +782,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -846,24 +855,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -906,22 +916,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] @@ -978,22 +989,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 @@ -1050,7 +1062,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1083,7 +1095,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1112,7 +1124,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1127,7 +1139,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1144,7 +1156,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1160,7 +1172,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -1175,7 +1187,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -1208,7 +1220,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1237,7 +1249,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1252,7 +1264,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,7 +1281,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1297,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -1304,24 +1316,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1379,24 +1392,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1449,24 +1463,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1505,24 +1520,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1556,22 +1572,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop @@ -1606,22 +1623,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop @@ -1660,26 +1678,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1711,24 +1730,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,24 +1821,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1873,24 +1894,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,22 +1955,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] @@ -2005,22 +2028,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 @@ -2078,7 +2102,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2111,7 +2135,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,7 +2164,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2155,7 +2179,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2172,7 +2196,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -2188,7 +2212,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -2203,7 +2227,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -2236,7 +2260,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2265,7 +2289,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2280,7 +2304,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2297,7 +2321,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2313,7 +2337,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -2331,24 +2355,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2406,24 +2431,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2476,24 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2532,24 +2559,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2583,22 +2611,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop @@ -2633,22 +2662,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop @@ -2687,26 +2717,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2738,24 +2769,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2828,24 +2860,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,24 +2933,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2960,22 +2994,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] @@ -3032,22 +3067,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 @@ -3103,24 +3139,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3143,7 +3180,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -3151,24 +3188,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm @@ -3180,28 +3218,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3213,36 +3252,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm @@ -3255,26 +3295,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3289,35 +3330,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm @@ -3329,26 +3371,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3363,35 +3406,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3400,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3408,15 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3438,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3457,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3468,21 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3501,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3533,24 +3579,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3573,7 +3620,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -3581,24 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -3610,28 +3658,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3643,36 +3692,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm @@ -3685,26 +3735,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3719,35 +3770,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -3759,26 +3811,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3793,35 +3846,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -3830,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3838,15 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3868,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3887,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3898,21 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3931,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3967,34 +4023,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -4024,7 +4082,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4041,7 +4099,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -4049,10 +4107,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -4060,13 +4119,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm @@ -4077,34 +4136,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -4128,12 +4189,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4143,36 +4204,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm @@ -4183,34 +4245,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -4234,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB7_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -4251,35 +4315,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm @@ -4290,34 +4355,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -4335,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB7_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -4357,19 +4424,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] @@ -4378,40 +4446,42 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -4441,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB7_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4460,31 +4530,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4492,8 +4562,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4501,10 +4571,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -4512,6 +4583,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -4534,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4560,30 +4632,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v2, s42 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4596,15 +4667,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4612,23 +4684,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4644,7 +4717,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -4652,10 +4725,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -4663,51 +4737,53 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -4758,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4779,36 +4855,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm @@ -4819,34 +4896,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -4896,10 +4975,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -4912,17 +4991,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] @@ -4930,17 +5010,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -4951,34 +5031,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -5016,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5036,61 +5118,64 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -5154,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5176,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -5195,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5204,8 +5289,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5213,10 +5298,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -5224,6 +5310,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -5269,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5294,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5332,7 +5419,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5369,7 +5456,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5400,7 +5487,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -5416,7 +5503,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -5434,7 +5521,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5462,18 +5549,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5483,9 +5570,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -5498,7 +5585,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -5535,7 +5622,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5566,7 +5653,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5582,7 +5669,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5600,7 +5687,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5628,18 +5715,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5649,9 +5736,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5667,24 +5754,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5747,24 +5835,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5820,24 +5909,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,24 +5968,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -5931,22 +6022,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -6000,22 +6092,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -6073,26 +6166,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6127,24 +6221,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6234,24 +6329,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6317,24 +6413,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6388,22 +6485,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6490,22 +6588,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, 0x7ff80000 :: v_dual_mov_b32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 @@ -6588,24 +6687,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -6628,7 +6728,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -6636,24 +6736,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm @@ -6665,28 +6766,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6698,36 +6800,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm @@ -6740,26 +6843,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -6774,35 +6878,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm @@ -6814,26 +6919,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6848,35 +6954,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -6885,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6893,15 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -6923,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6942,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6953,21 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -6986,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7018,24 +7127,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -7058,7 +7168,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -7066,24 +7176,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -7095,28 +7206,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7128,36 +7240,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7170,26 +7283,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7204,35 +7318,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -7244,26 +7359,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7278,35 +7394,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7315,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7323,15 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7353,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7372,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7383,21 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7416,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7452,34 +7571,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -7509,7 +7630,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7526,7 +7647,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -7534,10 +7655,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -7545,13 +7667,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm @@ -7562,34 +7684,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -7613,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB11_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7628,36 +7752,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm @@ -7668,34 +7793,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -7719,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB11_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7736,35 +7863,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm @@ -7775,34 +7903,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -7820,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB11_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7842,19 +7972,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] @@ -7863,40 +7994,42 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -7926,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB11_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start @@ -7945,31 +8078,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7977,8 +8110,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7986,10 +8119,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -7997,6 +8131,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -8019,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB11_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8045,30 +8180,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v2, s42 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8081,15 +8215,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8097,23 +8232,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8129,7 +8265,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -8137,10 +8273,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -8148,51 +8285,53 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -8243,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8264,36 +8403,37 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm @@ -8304,34 +8444,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -8381,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -8397,17 +8539,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] @@ -8415,17 +8558,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -8436,34 +8579,36 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -8501,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8521,61 +8666,64 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -8639,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8661,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8680,8 +8828,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8689,8 +8837,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8698,10 +8846,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -8709,6 +8858,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -8754,16 +8904,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8779,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8817,7 +8967,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -8850,7 +9000,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8879,7 +9029,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -8894,7 +9044,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -8911,7 +9061,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -8927,7 +9077,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -8942,7 +9092,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -8975,7 +9125,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9004,7 +9154,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9019,7 +9169,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9036,7 +9186,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9052,7 +9202,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -9071,7 +9221,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -9104,7 +9254,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9133,7 +9283,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -9148,7 +9298,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -9165,7 +9315,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -9181,7 +9331,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -9196,7 +9346,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -9229,7 +9379,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9258,7 +9408,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9273,7 +9423,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9290,7 +9440,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9306,7 +9456,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index b95b52168625dd..75447fc971c8b8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -24,7 +24,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -86,7 +86,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -101,7 +101,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -118,7 +118,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -134,7 +134,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -149,7 +149,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -182,7 +182,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -211,7 +211,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -226,7 +226,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -259,7 +259,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -277,24 +277,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -352,24 +353,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -422,24 +424,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -478,24 +481,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -529,22 +533,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop @@ -579,22 +584,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop @@ -633,26 +639,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -684,24 +691,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -774,24 +782,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -846,24 +855,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -906,22 +916,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] @@ -978,22 +989,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 @@ -1050,7 +1062,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1083,7 +1095,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1112,7 +1124,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1127,7 +1139,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1144,7 +1156,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1160,7 +1172,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -1175,7 +1187,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -1208,7 +1220,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1237,7 +1249,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1252,7 +1264,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,7 +1281,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1297,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -1304,24 +1316,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1379,24 +1392,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1449,24 +1463,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1505,24 +1520,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1556,22 +1572,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop @@ -1606,22 +1623,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop @@ -1660,26 +1678,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1711,24 +1730,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,24 +1821,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1873,24 +1894,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,22 +1955,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] @@ -2005,22 +2028,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 @@ -2078,7 +2102,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2111,7 +2135,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,7 +2164,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2155,7 +2179,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2172,7 +2196,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -2188,7 +2212,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -2203,7 +2227,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -2236,7 +2260,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2265,7 +2289,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2280,7 +2304,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2297,7 +2321,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2313,7 +2337,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -2331,24 +2355,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -2406,24 +2431,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2476,24 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2532,24 +2559,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2583,22 +2611,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop @@ -2633,22 +2662,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop @@ -2687,26 +2717,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2738,24 +2769,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2828,24 +2860,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,24 +2933,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2960,22 +2994,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1] @@ -3032,22 +3067,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0 @@ -3103,24 +3139,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3143,7 +3180,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -3151,24 +3188,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm @@ -3180,28 +3218,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3213,36 +3252,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm @@ -3255,26 +3295,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3289,35 +3330,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm @@ -3329,26 +3371,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3363,35 +3406,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3400,7 +3444,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3408,15 +3452,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3438,18 +3483,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3457,8 +3502,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3468,21 +3513,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3501,26 +3547,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3533,24 +3579,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -3573,7 +3620,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -3581,24 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -3610,28 +3658,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start @@ -3643,36 +3692,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm @@ -3685,26 +3735,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3719,35 +3770,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -3759,26 +3811,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3793,35 +3846,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -3830,7 +3884,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3838,15 +3892,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -3868,18 +3923,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3887,8 +3942,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3898,21 +3953,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -3931,26 +3987,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -3967,34 +4023,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -4024,7 +4082,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4041,7 +4099,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -4049,10 +4107,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -4060,13 +4119,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_4 ; GFX7LESS-NEXT: .LBB7_5: ; GFX7LESS-NEXT: s_endpgm @@ -4077,34 +4136,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -4128,12 +4189,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB7_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4143,36 +4204,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB7_4 ; GFX9-NEXT: .LBB7_5: ; GFX9-NEXT: s_endpgm @@ -4183,34 +4245,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -4234,12 +4298,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB7_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -4251,35 +4315,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1064-NEXT: .LBB7_5: ; GFX1064-NEXT: s_endpgm @@ -4290,34 +4355,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -4335,17 +4402,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB7_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -4357,19 +4424,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] @@ -4378,40 +4446,42 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1032-NEXT: .LBB7_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -4441,12 +4511,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB7_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4460,31 +4530,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1164-NEXT: .LBB7_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4492,8 +4562,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -4501,10 +4571,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -4512,6 +4583,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -4534,19 +4606,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start @@ -4560,30 +4632,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v2, s42 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_4 ; GFX1132-NEXT: .LBB7_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4596,15 +4667,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4612,23 +4684,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4644,7 +4717,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -4652,10 +4725,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -4663,51 +4737,53 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -4758,20 +4834,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4779,36 +4855,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX9-DPP-NEXT: .LBB7_3: ; GFX9-DPP-NEXT: s_endpgm @@ -4819,34 +4896,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -4896,10 +4975,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -4912,17 +4991,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] @@ -4930,17 +5010,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1064-DPP-NEXT: .LBB7_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -4951,34 +5031,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -5016,15 +5098,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -5036,61 +5118,64 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -5154,11 +5239,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5176,18 +5261,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -5195,8 +5280,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1164-DPP-NEXT: .LBB7_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5204,8 +5289,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5213,10 +5298,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -5224,6 +5310,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -5269,16 +5356,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start @@ -5294,26 +5381,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5332,7 +5419,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5369,7 +5456,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5400,7 +5487,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -5416,7 +5503,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -5434,7 +5521,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5462,18 +5549,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5483,9 +5570,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -5498,7 +5585,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -5535,7 +5622,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5566,7 +5653,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5582,7 +5669,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -5600,7 +5687,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -5628,18 +5715,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5649,9 +5736,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5667,24 +5754,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5747,24 +5835,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5820,24 +5909,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,24 +5968,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -5931,22 +6022,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -6000,22 +6092,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_mov_b32_e32 v5, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -6073,26 +6166,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6127,24 +6221,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6234,24 +6329,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6317,24 +6413,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6388,22 +6485,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -6490,22 +6588,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, 0x7ff80000 :: v_dual_mov_b32 v2, 0 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 @@ -6588,24 +6687,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -6628,7 +6728,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -6636,24 +6736,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm @@ -6665,28 +6766,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start @@ -6698,36 +6800,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm @@ -6740,26 +6843,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -6774,35 +6878,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm @@ -6814,26 +6919,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6848,35 +6954,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -6885,7 +6992,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6893,15 +7000,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -6923,18 +7031,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6942,8 +7050,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6953,21 +7061,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -6986,26 +7095,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7018,24 +7127,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -7058,7 +7168,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -7066,24 +7176,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -7095,28 +7206,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -7128,36 +7240,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7170,26 +7283,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7204,35 +7318,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -7244,26 +7359,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7278,35 +7394,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -7315,7 +7432,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7323,15 +7440,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7353,18 +7471,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7372,8 +7490,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7383,21 +7501,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7416,26 +7535,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7452,34 +7571,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -7509,7 +7630,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7526,7 +7647,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -7534,10 +7655,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -7545,13 +7667,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_4 ; GFX7LESS-NEXT: .LBB11_5: ; GFX7LESS-NEXT: s_endpgm @@ -7562,34 +7684,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 @@ -7613,12 +7737,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB11_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7628,36 +7752,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: v_mov_b32_e32 v2, s42 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB11_4 ; GFX9-NEXT: .LBB11_5: ; GFX9-NEXT: s_endpgm @@ -7668,34 +7793,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -7719,12 +7846,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB11_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7736,35 +7863,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1064-NEXT: .LBB11_5: ; GFX1064-NEXT: s_endpgm @@ -7775,34 +7903,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -7820,17 +7950,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB11_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7842,19 +7972,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] @@ -7863,40 +7994,42 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1032-NEXT: .LBB11_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 @@ -7926,12 +8059,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB11_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start @@ -7945,31 +8078,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1164-NEXT: .LBB11_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7977,8 +8110,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7986,10 +8119,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -7997,6 +8131,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 @@ -8019,19 +8154,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB11_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start @@ -8045,30 +8180,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v40 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s43 +; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v2, s42 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v2, s44 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off +; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1132-NEXT: .LBB11_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8081,15 +8215,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8097,23 +8232,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8129,7 +8265,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -8137,10 +8273,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -8148,51 +8285,53 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -8243,20 +8382,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[42:43], s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], s[44:45], s[44:45] ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8264,36 +8403,37 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX9-DPP-NEXT: .LBB11_3: ; GFX9-DPP-NEXT: s_endpgm @@ -8304,34 +8444,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -8381,10 +8523,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] @@ -8397,17 +8539,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] @@ -8415,17 +8558,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1064-DPP-NEXT: .LBB11_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -8436,34 +8579,36 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -8501,15 +8646,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -8521,61 +8666,64 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -8639,11 +8787,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8661,18 +8809,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8680,8 +8828,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1164-DPP-NEXT: .LBB11_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8689,8 +8837,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8698,10 +8846,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -8709,6 +8858,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -8754,16 +8904,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start @@ -8779,26 +8929,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8817,7 +8967,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -8850,7 +9000,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8879,7 +9029,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -8894,7 +9044,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -8911,7 +9061,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -8927,7 +9077,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -8942,7 +9092,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -8975,7 +9125,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9004,7 +9154,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9019,7 +9169,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9036,7 +9186,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9052,7 +9202,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -9071,7 +9221,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -9104,7 +9254,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9133,7 +9283,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -9148,7 +9298,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -9165,7 +9315,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -9181,7 +9331,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -9196,7 +9346,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -9229,7 +9379,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9258,7 +9408,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9273,7 +9423,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9290,7 +9440,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9306,7 +9456,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index ae4ca7b7356ef9..6dc3a1971a485f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -18,15 +18,15 @@ declare double @div.double.value() define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -141,33 +141,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -185,23 +185,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -210,24 +210,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -255,23 +255,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -287,23 +287,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -319,22 +319,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -342,33 +342,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -386,23 +386,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -411,9 +411,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -429,24 +429,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -500,24 +501,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -566,24 +568,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -632,24 +635,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -693,22 +697,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop @@ -754,22 +759,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop @@ -819,26 +825,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -868,24 +875,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -949,24 +957,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1025,24 +1034,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1091,22 +1101,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -1166,22 +1177,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -1243,17 +1255,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1291,11 +1303,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1303,7 +1315,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1333,18 +1345,18 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1373,9 +1385,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 @@ -1384,12 +1396,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1398,8 +1410,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1422,7 +1434,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1453,7 +1465,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -1464,14 +1476,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1480,9 +1492,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm @@ -1493,17 +1505,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1541,11 +1553,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -1553,7 +1565,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,18 +1595,18 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1623,9 +1635,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 @@ -1634,12 +1646,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1648,8 +1660,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1672,7 +1684,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1703,7 +1715,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -1714,14 +1726,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1730,9 +1742,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -1749,24 +1761,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,24 +1833,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1886,24 +1900,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1952,24 +1967,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2013,22 +2029,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop @@ -2074,22 +2091,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop @@ -2139,26 +2157,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2188,24 +2207,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2269,24 +2289,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2345,24 +2366,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2411,22 +2433,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -2486,22 +2509,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -2563,17 +2587,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2611,11 +2635,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2623,7 +2647,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2653,18 +2677,18 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2693,9 +2717,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 @@ -2704,12 +2728,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2718,8 +2742,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2742,7 +2766,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2773,7 +2797,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -2784,14 +2808,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2800,9 +2824,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2813,17 +2837,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2861,11 +2885,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -2873,7 +2897,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,18 +2927,18 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2943,9 +2967,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 @@ -2954,12 +2978,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2968,8 +2992,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2992,7 +3016,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -3023,7 +3047,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -3034,14 +3058,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3050,9 +3074,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3069,24 +3093,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3140,24 +3165,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3206,24 +3232,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3272,24 +3299,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3333,22 +3361,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop @@ -3394,22 +3423,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop @@ -3459,26 +3489,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3508,24 +3539,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3589,24 +3621,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3665,24 +3698,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3731,22 +3765,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -3806,22 +3841,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -3885,24 +3921,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -3956,24 +3993,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4022,24 +4060,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -4088,24 +4127,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -4149,22 +4189,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop @@ -4210,22 +4251,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop @@ -4275,26 +4317,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4324,24 +4367,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4405,24 +4449,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4481,24 +4526,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4547,22 +4593,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -4622,22 +4669,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -4699,17 +4747,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4747,11 +4795,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -4759,7 +4807,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4789,18 +4837,18 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4829,9 +4877,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 @@ -4840,12 +4888,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4854,8 +4902,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm @@ -4878,7 +4926,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4909,7 +4957,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -4920,14 +4968,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4936,9 +4984,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm @@ -4949,17 +4997,17 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4997,11 +5045,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5009,7 +5057,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5039,18 +5087,18 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -5079,9 +5127,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 @@ -5090,12 +5138,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5104,8 +5152,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -5128,7 +5176,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -5159,7 +5207,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -5170,14 +5218,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5186,9 +5234,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5204,24 +5252,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -5275,24 +5324,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5341,24 +5391,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -5407,24 +5458,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -5468,22 +5520,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop @@ -5529,22 +5582,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop @@ -5594,26 +5648,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5643,24 +5698,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5724,24 +5780,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5800,24 +5857,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5866,22 +5924,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1] @@ -5941,22 +6000,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0 @@ -6019,10 +6079,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6030,16 +6090,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6061,7 +6122,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -6069,25 +6130,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm @@ -6097,34 +6159,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s10 ; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s42, s9 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6136,35 +6199,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm @@ -6175,28 +6239,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6215,34 +6280,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm @@ -6250,30 +6316,31 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s9 +; GFX1032-NEXT: s_mov_b32 s33, s10 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6292,62 +6359,64 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6365,23 +6434,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6389,8 +6458,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6398,26 +6467,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6434,28 +6504,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6468,10 +6538,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 @@ -6479,16 +6549,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 @@ -6510,7 +6581,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -6518,25 +6589,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -6546,34 +6618,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start @@ -6585,35 +6658,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm @@ -6624,28 +6698,29 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s10, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s11, v3 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6664,34 +6739,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -6699,30 +6775,31 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 @@ -6741,62 +6818,64 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[10:11] +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6814,23 +6893,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6838,8 +6917,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6847,26 +6926,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s8, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6883,28 +6963,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6921,34 +7001,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 @@ -6976,7 +7058,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -6990,7 +7072,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -6998,10 +7080,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -7010,13 +7093,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4 ; GFX7LESS-NEXT: .LBB10_5: ; GFX7LESS-NEXT: s_endpgm @@ -7027,34 +7110,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 @@ -7076,11 +7161,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB10_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -7090,35 +7175,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB10_4 ; GFX9-NEXT: .LBB10_5: ; GFX9-NEXT: s_endpgm @@ -7129,34 +7215,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -7178,11 +7266,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB10_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -7194,34 +7282,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1064-NEXT: .LBB10_5: ; GFX1064-NEXT: s_endpgm @@ -7232,34 +7321,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -7275,16 +7366,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB10_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -7296,60 +7387,63 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1032-NEXT: .LBB10_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v41, 0 @@ -7377,11 +7471,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB10_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7393,23 +7487,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7417,8 +7511,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1164-NEXT: .LBB10_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7426,8 +7520,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -7435,10 +7529,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -7446,6 +7541,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v41, 0 @@ -7465,17 +7561,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start @@ -7487,28 +7583,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1132-NEXT: .LBB10_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7521,15 +7617,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -7537,25 +7634,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7569,7 +7667,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -7577,10 +7675,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -7589,51 +7688,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -7677,55 +7778,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[42:43] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[44:45] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm @@ -7736,34 +7838,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -7805,10 +7909,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7820,34 +7924,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -7858,34 +7963,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -7917,14 +8024,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -7936,60 +8043,63 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -8043,10 +8153,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -8058,23 +8168,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8082,8 +8192,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8091,8 +8201,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8100,10 +8210,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -8111,6 +8222,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -8152,14 +8264,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start @@ -8171,28 +8283,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8209,17 +8321,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8260,11 +8372,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8272,7 +8384,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8303,17 +8415,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8344,24 +8456,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8370,8 +8482,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm @@ -8394,7 +8506,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8426,7 +8538,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -8437,14 +8549,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8453,9 +8565,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm @@ -8466,17 +8578,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8517,11 +8629,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8529,7 +8641,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8560,17 +8672,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8601,24 +8713,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8627,8 +8739,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8651,7 +8763,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8683,7 +8795,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -8694,14 +8806,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8710,9 +8822,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -8727,24 +8839,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -8803,24 +8916,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8872,24 +8986,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -8941,24 +9056,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -9005,22 +9121,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -9069,22 +9186,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -9135,26 +9253,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9187,24 +9306,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9285,24 +9405,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9373,24 +9494,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9451,22 +9573,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9540,22 +9663,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -9629,17 +9753,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -9680,11 +9804,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9692,7 +9816,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9723,17 +9847,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -9764,24 +9888,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_mov_b32 s14, -1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -9790,8 +9914,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -9814,7 +9938,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -9846,7 +9970,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -9857,14 +9981,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9873,9 +9997,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -9886,17 +10010,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -9937,11 +10061,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -9949,7 +10073,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9980,17 +10104,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -10021,24 +10145,24 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10047,8 +10171,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -10071,7 +10195,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -10103,7 +10227,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 @@ -10114,14 +10238,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10130,9 +10254,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -10148,24 +10272,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -10224,24 +10349,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10293,24 +10419,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -10362,24 +10489,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -10426,22 +10554,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -10490,22 +10619,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -10556,26 +10686,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10608,24 +10739,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10706,24 +10838,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10794,24 +10927,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10872,22 +11006,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -10961,22 +11096,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11051,24 +11187,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s36, s36, s9 +; GFX7LESS-NEXT: s_add_u32 s36, s36, s11 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0 -; GFX7LESS-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 -; GFX7LESS-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -11127,24 +11264,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -11196,24 +11334,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s38, -1 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_mov_b32 s12, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_getpc_b64 s[2:3] -; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-NEXT: s_mov_b32 s32, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -11265,24 +11404,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s38, -1 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_mov_b32 s12, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_getpc_b64 s[2:3] -; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-NEXT: s_mov_b32 s32, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -11329,22 +11469,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s12, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_getpc_b64 s[2:3] -; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b32 s32, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec @@ -11393,22 +11534,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-NEXT: s_getpc_b64 s[2:3] -; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: s_mov_b32 s12, s13 -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b32 s13, s14 ; GFX1132-NEXT: s_mov_b32 s14, s15 ; GFX1132-NEXT: s_mov_b32 s32, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo @@ -11459,26 +11601,27 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11511,24 +11654,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b32 s12, s6 -; GFX9-DPP-NEXT: s_mov_b32 s13, s7 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11609,24 +11753,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11697,24 +11842,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11775,22 +11921,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11864,22 +12011,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3] -; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 -; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 @@ -11953,9 +12101,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 @@ -11968,15 +12116,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 @@ -11997,7 +12146,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -12005,25 +12154,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm @@ -12034,10 +12184,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 @@ -12050,19 +12200,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-NEXT: s_mov_b32 s43, s8 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12074,35 +12225,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm @@ -12113,9 +12265,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 @@ -12127,18 +12279,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b32 s42, s9 +; GFX1064-NEXT: s_mov_b32 s43, s8 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -12153,34 +12306,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm @@ -12191,12 +12345,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12205,17 +12359,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b32 s42, s9 +; GFX1032-NEXT: s_mov_b32 s43, s8 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -12230,41 +12385,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12285,15 +12441,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b32 s42, s9 +; GFX1164-NEXT: s_mov_b32 s43, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -12309,23 +12466,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12333,8 +12490,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12342,12 +12499,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12362,14 +12519,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12384,28 +12542,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12418,9 +12576,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12433,15 +12591,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 @@ -12462,7 +12621,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -12470,25 +12629,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm @@ -12499,10 +12659,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 @@ -12515,19 +12675,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start @@ -12539,35 +12700,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm @@ -12578,9 +12740,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 @@ -12592,18 +12754,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12618,34 +12781,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -12656,12 +12820,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] @@ -12670,17 +12834,18 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[44:45], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12695,41 +12860,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12750,15 +12916,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12774,23 +12941,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12798,8 +12965,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12807,12 +12974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12827,14 +12994,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[4:5], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[44:45], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12849,28 +13017,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12887,34 +13055,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec ; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1 @@ -12942,7 +13112,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -12956,7 +13126,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -12964,10 +13134,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -12976,13 +13147,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4 ; GFX7LESS-NEXT: .LBB17_5: ; GFX7LESS-NEXT: s_endpgm @@ -12993,34 +13164,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s50, -1 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_add_u32 s48, s48, s11 ; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-NEXT: s_mov_b32 s43, s8 ; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_mov_b32 s42, s9 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s10 +; GFX9-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: v_bfrev_b32_e32 v42, 1 @@ -13042,11 +13215,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB17_5 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -13056,35 +13229,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s12, s43 +; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s45 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX9-NEXT: s_cbranch_execnz .LBB17_4 ; GFX9-NEXT: .LBB17_5: ; GFX9-NEXT: s_endpgm @@ -13095,34 +13269,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-NEXT: s_mov_b32 s50, -1 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_mov_b32 s43, s8 ; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_mov_b32 s42, s9 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b32 s33, s10 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_mov_b32_e32 v41, 0 ; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec @@ -13144,11 +13320,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB17_5 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -13160,34 +13336,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s12, s43 +; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1064-NEXT: .LBB17_5: ; GFX1064-NEXT: s_endpgm @@ -13198,34 +13375,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_mov_b32 s43, s8 ; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b32 s33, s10 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_mov_b32_e32 v41, 0 ; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo @@ -13241,16 +13420,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB17_5 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -13262,60 +13441,63 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s12, s43 +; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1032-NEXT: .LBB17_5: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-NEXT: s_mov_b32 s43, s8 ; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_mov_b32 s42, s9 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b32 s33, s10 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s6 -; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: v_mov_b32_e32 v41, 0 @@ -13343,11 +13525,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB17_5 ; GFX1164-NEXT: ; %bb.3: -; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13359,23 +13541,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s12, s43 +; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13383,8 +13565,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1164-NEXT: .LBB17_5: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13392,8 +13574,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] @@ -13401,10 +13583,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-NEXT: s_mov_b32 s42, s14 +; GFX1132-NEXT: s_mov_b32 s43, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-NEXT: s_mov_b32 s12, s13 ; GFX1132-NEXT: s_mov_b32 s13, s14 @@ -13412,6 +13595,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s33, s15 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: v_mov_b32_e32 v41, 0 @@ -13431,17 +13615,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s46, 0 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132-NEXT: s_cbranch_execz .LBB17_5 ; GFX1132-NEXT: ; %bb.3: -; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start @@ -13453,28 +13637,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s12, s43 +; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_4 ; GFX1132-NEXT: .LBB17_5: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13487,15 +13671,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s10 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, s9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, s8 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -13503,25 +13688,26 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13535,7 +13721,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -13543,10 +13729,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 +; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] @@ -13555,51 +13742,53 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s54, -1 +; GFX9-DPP-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s52, s52, s11 +; GFX9-DPP-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s43, s8 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_mov_b32 s42, s9 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: s_mov_b32 s33, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[6:7] +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] @@ -13643,55 +13832,56 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63 -; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s45, v9, 63 +; GFX9-DPP-NEXT: v_readlane_b32 s44, v8, 63 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 -; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[46:47], s[36:37], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[48:49], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[46:47] ; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[42:43] +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -s[44:45] ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[52:55], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[52:55], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[52:55], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s12, s43 +; GFX9-DPP-NEXT: s_mov_b32 s13, s42 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s46 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s47 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] +; GFX9-DPP-NEXT: s_or_b64 s[48:49], vcc, s[48:49] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[48:49] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX9-DPP-NEXT: .LBB17_3: ; GFX9-DPP-NEXT: s_endpgm @@ -13702,34 +13892,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -13771,10 +13963,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13786,34 +13978,35 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-DPP-NEXT: s_clause 0x1 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1064-DPP-NEXT: .LBB17_3: ; GFX1064-DPP-NEXT: s_endpgm @@ -13824,34 +14017,36 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 @@ -13883,14 +14078,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) @@ -13902,60 +14097,63 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-DPP-NEXT: s_clause 0x1 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s46 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1032-DPP-NEXT: .LBB17_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b32 s43, s8 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -14009,10 +14207,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14024,23 +14222,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: s_clause 0x1 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -14048,8 +14246,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[46:47] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1164-DPP-NEXT: .LBB17_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -14057,8 +14255,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -14066,10 +14264,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX1132-DPP-NEXT: s_mov_b32 s42, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s43, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 @@ -14077,6 +14276,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 @@ -14118,14 +14318,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43] +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start @@ -14137,28 +14337,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_clause 0x1 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s46, vcc_lo, s46 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s46 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2 ; GFX1132-DPP-NEXT: .LBB17_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll index e41634402c0c2b..d590baa771fe49 100644 --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -71,7 +71,7 @@ bb: ; uniform load dominated by no-alias store - scalarize ; CHECK-LABEL: @no_memdep_alias_arg -; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[6:7], 0x0 +; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[8:9], 0x0 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], s[[[IN_LO]]:[[IN_HI]]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @@ -100,7 +100,7 @@ define amdgpu_kernel void @memdep(ptr addrspace(1) %in, [8 x i32], ptr addrspace ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK-DAG: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 ; CHECK-DAG: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[6:7], 0x0 +; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[8:9], 0x0 ; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll index e1d253db9fce1c..6c921441c972d3 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll +++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll @@ -30,12 +30,12 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; REVERSEXNACK-LABEL: shuffle_v4f16_234u: ; REVERSEXNACK: ; %bb.0: ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v3 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v1 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v0 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v2 -; REVERSEXNACK-NEXT: global_load_dword v0, v[3:4], off offset:4 -; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[5:6], off +; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v1 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v0 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v3 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v2 +; REVERSEXNACK-NEXT: global_load_dword v0, v[5:6], off offset:4 +; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[3:4], off ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) ; REVERSEXNACK-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 1109a0a25349f7..fbb54893d9b2ac 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: load_f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -19,8 +19,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; ; VI-LABEL: load_f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -31,10 +31,10 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; GFX11-LABEL: load_f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm store half %arg, ptr addrspace(1) %out @@ -44,8 +44,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: load_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -55,8 +55,8 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; ; VI-LABEL: load_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -67,10 +67,10 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; GFX11-LABEL: load_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm store <2 x half> %arg, ptr addrspace(1) %out @@ -80,7 +80,7 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -96,7 +96,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; ; GFX11-LABEL: load_v3f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -113,7 +113,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -124,7 +124,7 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -137,8 +137,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: load_v8f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -151,8 +151,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; ; VI-LABEL: load_v8f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -166,13 +166,13 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-LABEL: load_v8f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm store <8 x half> %arg, ptr addrspace(1) %out ret void @@ -181,8 +181,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: extload_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -194,8 +194,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -208,13 +208,13 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; GFX11-LABEL: extload_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %fpext = fpext <2 x half> %in to <2 x float> @@ -225,8 +225,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -236,8 +236,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -248,11 +248,11 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; GFX11-LABEL: extload_f16_to_f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext half %arg to float @@ -263,8 +263,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -276,8 +276,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -290,13 +290,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; GFX11-LABEL: extload_v2f16_to_v2f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext <2 x half> %arg to <2 x float> @@ -307,7 +307,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -320,7 +320,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -333,7 +333,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -350,7 +350,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -365,7 +365,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -380,7 +380,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 @@ -399,8 +399,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -427,8 +427,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -456,25 +456,25 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; GFX11-LABEL: extload_v8f16_to_v8f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s8, s7, 16 -; GFX11-NEXT: s_lshr_b32 s9, s6, 16 -; GFX11-NEXT: s_lshr_b32 s2, s5, 16 -; GFX11-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX11-NEXT: s_lshr_b32 s8, s3, 16 +; GFX11-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, s2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s8 ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, s9 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s7 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, ptr addrspace(1) %out @@ -484,10 +484,10 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -497,10 +497,10 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[8:9], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -510,11 +510,11 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; GFX11-LABEL: extload_f16_to_f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -528,12 +528,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -544,12 +544,12 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[8:9], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -560,13 +560,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; GFX11-LABEL: extload_v2f16_to_v2f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 @@ -581,7 +581,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -602,7 +602,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -623,7 +623,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -647,7 +647,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -672,7 +672,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -697,7 +697,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s5, s3, 16 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -724,8 +724,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -771,8 +771,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -818,20 +818,20 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; GFX11-LABEL: extload_v8f16_to_v8f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s9, s7, 16 -; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s1, s5, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 +; GFX11-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-NEXT: s_lshr_b32 s7, s1, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX11-NEXT: s_lshr_b32 s0, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 @@ -840,7 +840,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -857,7 +857,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -870,7 +870,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: global_load_store_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -885,7 +885,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -898,7 +898,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -913,7 +913,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -926,7 +926,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; ; GFX11-LABEL: global_load_store_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -941,7 +941,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -954,7 +954,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -969,7 +969,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -983,7 +983,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1068,7 +1068,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1084,7 +1084,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] @@ -1105,7 +1105,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1124,7 +1124,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1141,7 +1141,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] @@ -1164,7 +1164,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1194,7 +1194,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1308,7 +1308,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1358,7 +1358,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v20, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1420,7 +1420,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1439,7 +1439,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1474,7 +1474,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1546,7 +1546,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1601,7 +1601,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1627,7 +1627,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] @@ -1658,7 +1658,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1706,7 +1706,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1750,7 +1750,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] @@ -1790,7 +1790,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1885,7 +1885,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1971,7 +1971,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2038,7 +2038,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2052,7 +2052,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; ; GFX11-LABEL: global_truncstore_f32_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2086,7 +2086,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2102,7 +2102,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -2122,7 +2122,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2145,7 +2145,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2167,7 +2167,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] @@ -2190,7 +2190,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2211,7 +2211,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2230,7 +2230,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2288,7 +2288,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2319,7 +2319,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2351,7 +2351,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2420,7 +2420,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2480,7 +2480,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -2530,12 +2530,12 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 { ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[8:9], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2546,8 +2546,8 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2560,13 +2560,13 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_f16_e64 v1, s4, s2 +; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %add = fadd half %a, %b @@ -2577,7 +2577,7 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 { ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2598,7 +2598,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2615,7 +2615,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 @@ -2629,7 +2629,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2666,7 +2666,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2685,7 +2685,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: fadd_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2705,33 +2705,33 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s2, s8, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: s_lshr_b32 s2, s11, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; CI-NEXT: s_lshr_b32 s2, s12, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; CI-NEXT: s_lshr_b32 s2, s13, 16 -; CI-NEXT: s_lshr_b32 s3, s9, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v9, s2 -; CI-NEXT: s_lshr_b32 s2, s14, 16 -; CI-NEXT: s_lshr_b32 s4, s10, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, s2 -; CI-NEXT: s_lshr_b32 s2, s15, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s9 -; CI-NEXT: v_cvt_f32_f16_e32 v11, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s12 -; CI-NEXT: v_cvt_f32_f16_e32 v13, s13 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v7, s11 -; CI-NEXT: v_cvt_f32_f16_e32 v14, s15 -; CI-NEXT: v_cvt_f32_f16_e32 v15, s14 +; CI-NEXT: s_lshr_b32 s10, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v8, s0 +; CI-NEXT: s_lshr_b32 s0, s5, 16 +; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 +; CI-NEXT: s_lshr_b32 s0, s6, 16 +; CI-NEXT: s_lshr_b32 s12, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 +; CI-NEXT: s_lshr_b32 s10, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v10, s0 +; CI-NEXT: s_lshr_b32 s0, s7, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s10 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v11, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v15, s6 ; CI-NEXT: v_add_f32_e32 v1, v1, v9 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 ; CI-NEXT: v_add_f32_e32 v3, v3, v11 @@ -2754,66 +2754,66 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v1, v5, v1 ; CI-NEXT: v_or_b32_e32 v0, v4, v0 -; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_or_b32_e32 v3, v7, v3 ; CI-NEXT: v_or_b32_e32 v2, v6, v2 -; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s15, 16 -; VI-NEXT: s_lshr_b32 s3, s11, 16 -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s3, 16 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s11, v0 -; VI-NEXT: s_lshr_b32 s2, s14, 16 -; VI-NEXT: s_lshr_b32 s3, s10, 16 +; VI-NEXT: v_add_f16_e32 v0, s3, v0 +; VI-NEXT: s_lshr_b32 s3, s6, 16 +; VI-NEXT: s_lshr_b32 s7, s2, 16 ; VI-NEXT: v_or_b32_e32 v3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_add_f16_e32 v1, s10, v1 -; VI-NEXT: s_lshr_b32 s2, s13, 16 -; VI-NEXT: s_lshr_b32 s3, s9, 16 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_add_f16_e32 v1, s2, v1 +; VI-NEXT: s_lshr_b32 s2, s5, 16 +; VI-NEXT: s_lshr_b32 s3, s1, 16 ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_add_f16_e32 v1, s9, v1 -; VI-NEXT: s_lshr_b32 s2, s12, 16 -; VI-NEXT: s_lshr_b32 s3, s8, 16 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_f16_e32 v1, s1, v1 +; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s2, s0, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_add_f16_e32 v4, s8, v4 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_e32 v4, s0, v4 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fadd_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, s7, s11 -; GFX11-NEXT: v_pk_add_f16 v2, s6, s10 -; GFX11-NEXT: v_pk_add_f16 v1, s5, s9 -; GFX11-NEXT: v_pk_add_f16 v0, s4, s8 +; GFX11-NEXT: v_pk_add_f16 v3, s11, s15 +; GFX11-NEXT: v_pk_add_f16 v2, s10, s14 +; GFX11-NEXT: v_pk_add_f16 v1, s9, s13 +; GFX11-NEXT: v_pk_add_f16 v0, s8, s12 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %add = fadd <8 x half> %a, %b @@ -2824,7 +2824,7 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2837,7 +2837,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: test_bitcast_from_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2853,7 +2853,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2866,7 +2866,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_bitcast_to_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index a4a8f43646d4ba..5dff660912e402 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -318,8 +318,8 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] ; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] ; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 -; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 ; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 +; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 ; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 35b6bfbee111fb..5fab0c50bbe574 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -53,8 +53,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: udiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -101,8 +101,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -170,8 +170,8 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -215,8 +215,8 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: urem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -261,8 +261,8 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -329,14 +329,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_ashr_i32 s3, s5, 31 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: s_abs_i32 s2, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_ashr_i32 s4, s6, 31 +; GFX9-NEXT: s_sub_i32 s5, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -347,25 +347,25 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s5 -; GFX9-NEXT: s_mul_i32 s7, s6, s4 -; GFX9-NEXT: s_sub_i32 s7, s2, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5 +; GFX9-NEXT: s_mul_i32 s7, s6, s2 +; GFX9-NEXT: s_sub_i32 s7, s3, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s4 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_sub_i32 s9, s7, s2 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s3 -; GFX9-NEXT: s_sub_i32 s6, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -373,44 +373,44 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: sdiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s4, s5 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s3, 0, s4 +; GFX10-NEXT: s_abs_i32 s2, s3 +; GFX10-NEXT: s_ashr_i32 s3, s3, 31 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s4, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s6, v0 +; GFX10-NEXT: v_readfirstlane_b32 s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s3, s3, s6 -; GFX10-NEXT: s_mul_hi_u32 s5, s6, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: s_add_i32 s5, s6, s5 +; GFX10-NEXT: s_mul_i32 s4, s4, s5 +; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s5, s5, s6 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s6, s3, s5 -; GFX10-NEXT: s_mul_i32 s7, s6, s4 +; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX10-NEXT: s_mul_i32 s7, s6, s2 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_sub_i32 s7, s3, s7 -; GFX10-NEXT: s_sub_i32 s9, s7, s4 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_sub_i32 s7, s4, s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s2 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_add_i32 s3, s3, 1 -; GFX10-NEXT: s_xor_b32 s6, s6, s2 -; GFX10-NEXT: s_sub_i32 s6, s6, s2 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: s_xor_b32 s6, s6, s3 +; GFX10-NEXT: s_sub_i32 s6, s6, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -418,51 +418,51 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s4, s5 -; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s3, 0, s4 +; GFX11-NEXT: s_abs_i32 s2, s3 +; GFX11-NEXT: s_ashr_i32 s3, s3, 31 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s4, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s6, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s3, s3, s6 +; GFX11-NEXT: s_mul_i32 s4, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s6, s3 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_add_i32 s5, s6, s5 +; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s5, s5, s6 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s6, s3, s5 -; GFX11-NEXT: s_mul_i32 s7, s6, s4 +; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX11-NEXT: s_mul_i32 s7, s6, s2 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_sub_i32 s7, s3, s7 +; GFX11-NEXT: s_sub_i32 s7, s4, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s9, s7, s4 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 +; GFX11-NEXT: s_sub_i32 s9, s7, s2 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 -; GFX11-NEXT: s_add_i32 s3, s3, 1 -; GFX11-NEXT: s_xor_b32 s6, s6, s2 +; GFX11-NEXT: s_add_i32 s4, s4, 1 +; GFX11-NEXT: s_xor_b32 s6, s6, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s6, s2 +; GFX11-NEXT: s_sub_i32 s6, s6, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -486,126 +486,126 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_sub_i32 s3, 0, s4 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_abs_i32 s2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s5 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_add_i32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s4, s5, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s2, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX9-NEXT: s_mul_i32 s5, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s3, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: srem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s4, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s2, 0, s4 +; GFX10-NEXT: s_abs_i32 s2, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s3 -; GFX10-NEXT: s_mul_hi_u32 s5, s3, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_add_i32 s3, s3, s5 +; GFX10-NEXT: s_mul_i32 s3, s3, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s4, s4, s5 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX10-NEXT: s_mul_i32 s5, s5, s4 -; GFX10-NEXT: s_sub_i32 s5, s2, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s4 -; GFX10-NEXT: s_cmp_ge_u32 s5, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX10-NEXT: s_mul_i32 s5, s5, s2 +; GFX10-NEXT: s_sub_i32 s5, s3, s5 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 s5, s2 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s4 -; GFX10-NEXT: s_cmp_ge_u32 s5, s4 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 s5, s2 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: s_add_i32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: srem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s4, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s2, 0, s4 +; GFX11-NEXT: s_abs_i32 s2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s3, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_mul_i32 s3, s3, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s3, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_add_i32 s3, s3, s5 +; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s4, s4, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB3_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX11-NEXT: s_mul_i32 s5, s5, s4 +; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX11-NEXT: s_mul_i32 s5, s5, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s5, s2, s5 -; GFX11-NEXT: s_sub_i32 s6, s5, s4 -; GFX11-NEXT: s_cmp_ge_u32 s5, s4 +; GFX11-NEXT: s_sub_i32 s5, s3, s5 +; GFX11-NEXT: s_sub_i32 s6, s5, s2 +; GFX11-NEXT: s_cmp_ge_u32 s5, s2 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s5, s4 -; GFX11-NEXT: s_cmp_ge_u32 s5, s4 +; GFX11-NEXT: s_sub_i32 s6, s5, s2 +; GFX11-NEXT: s_cmp_ge_u32 s5, s2 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 -; GFX11-NEXT: s_add_i32 s2, s2, 1 +; GFX11-NEXT: s_add_i32 s3, s3, 1 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -629,12 +629,12 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -660,10 +660,10 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: udiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s2, s4, 0xffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -690,10 +690,10 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, 0 @@ -742,20 +742,20 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_and_b32 s2, s0, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_and_b32 s3, 0xffff, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX9-NEXT: s_add_i32 s2, s2, 1 -; GFX9-NEXT: s_lshl_b32 s5, s3, 1 -; GFX9-NEXT: s_and_b32 s6, s2, 0xffff +; GFX9-NEXT: s_and_b32 s4, 0xffff, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: s_lshl_b32 s5, s4, 1 +; GFX9-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 @@ -763,9 +763,9 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_scc0 .LBB5_1 @@ -775,11 +775,11 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: urem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s2, s4, 0xffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 @@ -807,11 +807,11 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -862,33 +862,33 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: sdiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_sext_i32_i16 s2, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: s_ashr_i32 s3, s3, 30 -; GFX9-NEXT: s_or_b32 s3, s3, 1 +; GFX9-NEXT: s_sext_i32_i16 s4, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX9-NEXT: s_xor_b32 s4, s4, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s6, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: s_cselect_b32 s3, s3, 0 -; GFX9-NEXT: s_and_b32 s5, 0xffff, s2 -; GFX9-NEXT: s_add_i32 s2, s2, 1 -; GFX9-NEXT: v_add_u32_e32 v2, s3, v3 -; GFX9-NEXT: s_lshl_b32 s3, s5, 1 -; GFX9-NEXT: s_and_b32 s5, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 +; GFX9-NEXT: s_lshl_b32 s4, s5, 1 +; GFX9-NEXT: s_and_b32 s5, s3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] @@ -899,11 +899,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: sdiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s4 +; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 @@ -935,11 +935,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -995,36 +995,36 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_sext_i32_i16 s2, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s5, s3, s4 -; GFX9-NEXT: s_ashr_i32 s5, s5, 30 -; GFX9-NEXT: s_or_b32 s5, s5, 1 +; GFX9-NEXT: s_sext_i32_i16 s6, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX9-NEXT: s_xor_b32 s4, s6, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s7, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: s_cselect_b32 s5, s5, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s5, v3 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 -; GFX9-NEXT: s_and_b32 s6, 0xffff, s2 -; GFX9-NEXT: s_add_i32 s2, s2, 1 -; GFX9-NEXT: s_lshl_b32 s5, s6, 1 -; GFX9-NEXT: s_and_b32 s6, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: s_lshl_b32 s4, s5, 1 +; GFX9-NEXT: s_and_b32 s5, s3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400 +; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_scc0 .LBB7_1 @@ -1034,11 +1034,11 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: srem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s4 +; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 @@ -1072,11 +1072,11 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: srem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 011a366267afe1..013eb8f59bbb9c 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -12,20 +12,20 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -33,26 +33,26 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -60,61 +60,62 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -150,20 +151,20 @@ entry: define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MulMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -173,25 +174,25 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MulMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -199,66 +200,67 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s2 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -290,20 +292,20 @@ entry: define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -311,26 +313,26 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -338,61 +340,62 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -425,20 +428,20 @@ entry: define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedTypedMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -446,26 +449,26 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MixedTypedMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -473,67 +476,68 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -566,20 +570,20 @@ entry: define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_alt_AddOperands: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -587,26 +591,26 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_alt_AddOperands: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -614,22 +618,22 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -638,41 +642,42 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -705,20 +710,20 @@ entry: define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedExt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -726,26 +731,26 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MixedExt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -753,67 +758,68 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -846,114 +852,115 @@ entry: define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_SameVec: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v0, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v0, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: notudot2_SameVec: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v2, s2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -986,21 +993,21 @@ entry: define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1008,26 +1015,26 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_v4i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1035,61 +1042,62 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1122,20 +1130,20 @@ entry: define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16_Hi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -1143,22 +1151,22 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_v4i16_Hi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1166,7 +1174,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -1174,61 +1182,62 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1261,21 +1270,21 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_v4i16_Even: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1283,26 +1292,26 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: notudot2_v4i16_Even: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1310,67 +1319,68 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1403,21 +1413,21 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_v4i16_Middle: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3] +; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1425,26 +1435,26 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: notudot2_v4i16_Middle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1452,67 +1462,68 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] -; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1545,20 +1556,20 @@ entry: define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_DiffIndex: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -1566,26 +1577,26 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: notudot2_DiffIndex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -1593,67 +1604,68 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1686,20 +1698,20 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -1707,27 +1719,27 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -1735,65 +1747,66 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -1801,9 +1814,9 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1838,20 +1851,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -1859,27 +1872,27 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -1887,65 +1900,66 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -1953,9 +1967,9 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1990,20 +2004,20 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2011,27 +2025,27 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2039,23 +2053,23 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2064,20 +2078,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2086,22 +2100,23 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2109,10 +2124,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2148,20 +2163,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -2169,27 +2184,27 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s4 +; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2197,23 +2212,23 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2222,20 +2237,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2244,22 +2259,23 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2267,10 +2283,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2306,48 +2322,48 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2355,23 +2371,23 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2379,20 +2395,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2400,22 +2416,23 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2423,10 +2440,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2462,20 +2479,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -2483,27 +2500,27 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2511,23 +2528,23 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2535,20 +2552,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2556,22 +2573,23 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2579,10 +2597,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2618,20 +2636,20 @@ entry: define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2641,25 +2659,25 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -2673,14 +2691,14 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2688,19 +2706,19 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-NODL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_short v1, v0, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2708,21 +2726,21 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2730,7 +2748,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2759,20 +2777,20 @@ entry: define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX7-LABEL: notsdot2_sext8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 @@ -2780,26 +2798,26 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: notsdot2_sext8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ushort v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 @@ -2809,22 +2827,22 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2832,50 +2850,51 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0001 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notsdot2_sext8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0001 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 -; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index d2247e0aa20890..1b181643b3d469 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -10,18 +10,18 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -44,19 +44,19 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -66,26 +66,26 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,62 +93,62 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -195,18 +195,18 @@ entry: define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -237,20 +237,20 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8 @@ -278,14 +278,14 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -307,57 +307,57 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_sshort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_sshort v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_sshort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_sshort v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3 -; GFX10-DL-NEXT: global_store_short v1, v4, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v4, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_i16 v3, v1, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_i16 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -404,18 +404,18 @@ entry: define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -438,20 +438,20 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 @@ -471,14 +471,14 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -492,57 +492,57 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 -; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -581,18 +581,18 @@ entry: define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -616,19 +616,19 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -638,27 +638,27 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s0 ; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v8 ; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -669,77 +669,77 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s2 +; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s0 ; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v3, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -788,18 +788,18 @@ entry: define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -822,19 +822,19 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3 @@ -848,24 +848,24 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0 ; GFX8-NEXT: v_mad_i32_i24 v0, v1, v2, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v5, v7, v0 ; GFX8-NEXT: v_mad_i32_i24 v2, v4, v6, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 @@ -876,62 +876,62 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s2, v3 +; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s0, v3 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -964,18 +964,18 @@ entry: define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -1006,20 +1006,20 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -1043,15 +1043,15 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-NODL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1060,35 +1060,35 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s2 -; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s2 +; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1097,36 +1097,36 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1152,23 +1152,22 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(2) ; GFX11-DL-NEXT: v_ashrrev_i16 v4, 8, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) @@ -1200,7 +1199,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 ; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1233,20 +1232,20 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 @@ -1254,26 +1253,26 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v1, v1, v3, s4 +; GFX7-NEXT: v_mad_i32_i24 v1, v1, v3, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8 @@ -1281,95 +1280,95 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 -; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1403,18 +1402,18 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1434,19 +1433,19 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -1455,24 +1454,24 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -1481,75 +1480,75 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 -; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1590,18 +1589,18 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1621,19 +1620,19 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v3 ; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8 @@ -1642,24 +1641,24 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v0 ; GFX8-NEXT: v_bfe_i32 v5, v0, 0, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v3, 24, v1 @@ -1668,75 +1667,75 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 -; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1776,18 +1775,18 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 @@ -1808,20 +1807,20 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8 @@ -1839,12 +1838,12 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -1855,56 +1854,55 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1949,7 +1947,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -1986,7 +1984,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2024,13 +2022,13 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2042,20 +2040,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-DL-NEXT: s_load_dword s1, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -2064,19 +2062,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v3, v1, s1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2086,12 +2084,12 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v3, v0 -; GFX10-DL-NEXT: global_store_dword v2, v1, s[10:11] +; GFX10-DL-NEXT: global_store_dword v2, v1, s[14:15] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2158,7 +2156,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2192,7 +2190,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2227,13 +2225,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 @@ -2244,21 +2242,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -2268,19 +2266,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2291,12 +2289,12 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 -; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11] +; GFX10-DL-NEXT: global_store_dword v3, v1, s[14:15] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2357,55 +2355,56 @@ entry: define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s11 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; GFX7-NEXT: s_sext_i32_i16 s0, s0 -; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 +; GFX7-NEXT: s_sext_i32_i16 s1, s12 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 -; GFX7-NEXT: v_mad_i32_i24 v1, v3, s0, v1 +; GFX7-NEXT: v_mad_i32_i24 v1, v3, s1, v1 ; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX8-NEXT: s_sext_i32_i16 s3, s6 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_sext_i32_i16 s3, s8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt vmcnt(1) @@ -2425,15 +2424,16 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_sext_i32_i16 s3, s8 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX9-NODL-NEXT: s_sext_i32_i16 s3, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -2449,16 +2449,17 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_sext_i32_i16 s4, s8 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 +; GFX9-DL-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 @@ -2475,16 +2476,18 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: idot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s8 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX10-DL-NEXT: s_sext_i32_i16 s3, s6 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 @@ -2500,17 +2503,18 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: idot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b32 s6, s[4:5], 0x3c ; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_sext_i32_i16 s3, s8 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-DL-NEXT: s_sext_i32_i16 s3, s6 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 @@ -2568,18 +2572,18 @@ entry: define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -2599,19 +2603,19 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -2620,24 +2624,24 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -2646,75 +2650,75 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 -; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x3c ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -2759,7 +2763,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2792,7 +2796,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2826,13 +2830,13 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 8, 8 @@ -2843,21 +2847,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 @@ -2867,19 +2871,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[10:11] +; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -2890,12 +2894,12 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 -; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11] +; GFX10-DL-NEXT: global_store_dword v3, v1, s[14:15] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2958,25 +2962,25 @@ entry: define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s15, 0xf000 +; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, s15 +; GFX7-NEXT: s_mov_b32 s19, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13] ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] +; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 -; GFX7-NEXT: s_mov_b32 s14, -1 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 @@ -2984,7 +2988,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v5, v3, 0, 8 ; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s0 +; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v2, v4, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8 @@ -2994,29 +2998,29 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_4src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -3044,14 +3048,14 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-NODL-NEXT: global_load_dword v4, v0, s[14:15] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(3) @@ -3070,16 +3074,16 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v4, v0, s[14:15] ; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c @@ -3099,15 +3103,15 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v4, v0, s[14:15] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501 @@ -3126,17 +3130,17 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x3 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[10:11] +; GFX11-DL-NEXT: global_load_b32 v3, v0, s[12:13] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[14:15] ; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(2) ; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501 @@ -3208,18 +3212,18 @@ entry: define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_nonstandard_signed: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 @@ -3245,21 +3249,21 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_nonstandard_signed: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -3282,13 +3286,13 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_nonstandard_signed: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 @@ -3298,7 +3302,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3307,18 +3311,18 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_nonstandard_signed: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 @@ -3328,7 +3332,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3337,20 +3341,20 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_nonstandard_signed: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -3371,21 +3375,20 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_nonstandard_signed: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) @@ -3409,7 +3412,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3449,268 +3452,5 @@ entry: ret void } -; The first (S0) operand of the v_dot4 is derived from the LHS of the mul chain (that is %op80, %op50). -; These correspond to the 0th, and 4th bytes starting from %inptr1. -; Confirm that we are actually accessing these bytes. -; -; Previously, we used the dword offset from the corresponding byte in the second (S1) operand. -; The result was to access the 0th byte instead of the 4th (i.e. a dword offset of 0 instead of 1). - -define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) %inptr1, i8 %l81, i8 %l51) { -; GFX7-LABEL: ByteOffsetCorrectness: -; GFX7: ; %bb.0: ; %.entry -; GFX7-NEXT: s_load_dword s0, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s8, 0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s2, s0 -; GFX7-NEXT: s_bfe_i32 s3, s0, 0x80008 -; GFX7-NEXT: s_mov_b32 s9, s8 -; GFX7-NEXT: s_mov_b32 s10, s6 -; GFX7-NEXT: s_mov_b32 s11, s7 -; GFX7-NEXT: s_and_b64 s[0:1], exec, -1 -; GFX7-NEXT: .LBB17_1: ; %.lr.ph -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:4 -; GFX7-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 -; GFX7-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:1 -; GFX7-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 -; GFX7-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:3 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_lo_u32 v1, v1, s2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX7-NEXT: s_mov_b64 vcc, s[0:1] -; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX7-NEXT: s_cbranch_vccnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; %DummyReturnBlock -; GFX7-NEXT: s_endpgm -; -; GFX8-LABEL: ByteOffsetCorrectness: -; GFX8: ; %bb.0: ; %.entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v10, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s2, s6 -; GFX8-NEXT: s_add_u32 s4, s0, 4 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: s_bfe_i32 s3, s6, 0x80008 -; GFX8-NEXT: s_add_u32 s6, s0, 3 -; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: s_add_u32 s8, s0, 2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_add_u32 s0, s0, 1 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NEXT: v_mov_b32_e32 v9, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_mov_b32_e32 v6, s0 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: s_and_b64 s[0:1], exec, -1 -; GFX8-NEXT: .LBB17_1: ; %.lr.ph -; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: flat_load_sbyte v12, v[0:1] -; GFX8-NEXT: flat_load_sbyte v13, v[2:3] -; GFX8-NEXT: flat_load_ubyte v14, v[4:5] -; GFX8-NEXT: flat_load_ubyte v15, v[6:7] -; GFX8-NEXT: flat_load_ubyte v16, v[8:9] -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_mul_lo_u32 v12, v12, s2 -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_mul_lo_u32 v13, v13, s3 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_or_b32_e32 v14, v16, v14 -; GFX8-NEXT: v_or_b32_sdwa v13, v13, sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13 -; GFX8-NEXT: s_mov_b64 vcc, s[0:1] -; GFX8-NEXT: flat_store_dword v[10:11], v12 -; GFX8-NEXT: s_cbranch_vccnz .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %DummyReturnBlock -; GFX8-NEXT: s_endpgm -; -; GFX9-NODL-LABEL: ByteOffsetCorrectness: -; GFX9-NODL: ; %bb.0: ; %.entry -; GFX9-NODL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s4, 0x80008 -; GFX9-NODL-NEXT: s_and_b64 vcc, exec, -1 -; GFX9-NODL-NEXT: .LBB17_1: ; %.lr.ph -; GFX9-NODL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NODL-NEXT: global_load_sbyte v3, v2, s[0:1] -; GFX9-NODL-NEXT: global_load_sbyte v4, v2, s[0:1] offset:4 -; GFX9-NODL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:3 -; GFX9-NODL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2 -; GFX9-NODL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(4) -; GFX9-NODL-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(3) -; GFX9-NODL-NEXT: v_mul_lo_u32 v4, v4, s3 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NODL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v3, off -; GFX9-NODL-NEXT: s_mov_b64 vcc, vcc -; GFX9-NODL-NEXT: s_cbranch_vccnz .LBB17_1 -; GFX9-NODL-NEXT: ; %bb.2: ; %DummyReturnBlock -; GFX9-NODL-NEXT: s_endpgm -; -; GFX9-DL-LABEL: ByteOffsetCorrectness: -; GFX9-DL: ; %bb.0: ; %.entry -; GFX9-DL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0xc0c0400 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0400 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_perm_b32 v3, s3, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_and_b64 vcc, exec, -1 -; GFX9-DL-NEXT: .LBB17_1: ; %.lr.ph -; GFX9-DL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3 -; GFX9-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4 -; GFX9-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2 -; GFX9-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1 -; GFX9-DL-NEXT: global_load_ubyte v8, v2, s[0:1] -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v5, v8, v5, s2 -; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX9-DL-NEXT: v_dot4_i32_i8 v4, v5, v3, v4 -; GFX9-DL-NEXT: global_store_dword v[0:1], v4, off -; GFX9-DL-NEXT: s_mov_b64 vcc, vcc -; GFX9-DL-NEXT: s_cbranch_vccnz .LBB17_1 -; GFX9-DL-NEXT: ; %bb.2: ; %DummyReturnBlock -; GFX9-DL-NEXT: s_endpgm -; -; GFX10-DL-LABEL: ByteOffsetCorrectness: -; GFX10-DL: ; %bb.0: ; %.entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xc0c0400 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 vcc_lo, exec_lo -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i8 s2, s4 -; GFX10-DL-NEXT: s_bfe_i32 s3, s4, 0x80008 -; GFX10-DL-NEXT: v_perm_b32 v3, s2, s3, v3 -; GFX10-DL-NEXT: .LBB17_1: ; %.lr.ph -; GFX10-DL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-DL-NEXT: s_clause 0x4 -; GFX10-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3 -; GFX10-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4 -; GFX10-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2 -; GFX10-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1 -; GFX10-DL-NEXT: global_load_ubyte v8, v2, s[0:1] -; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v5, v3 -; GFX10-DL-NEXT: global_store_dword v[0:1], v4, off -; GFX10-DL-NEXT: s_cbranch_vccnz .LBB17_1 -; GFX10-DL-NEXT: ; %bb.2: ; %DummyReturnBlock -; GFX10-DL-NEXT: s_endpgm -; -; GFX11-DL-LABEL: ByteOffsetCorrectness: -; GFX11-DL: ; %bb.0: ; %.entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0xc0c0400 :: v_dual_mov_b32 v2, 0 -; GFX11-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-DL-NEXT: s_mov_b32 vcc_lo, exec_lo -; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_sext_i32_i8 s2, s4 -; GFX11-DL-NEXT: s_bfe_i32 s3, s4, 0x80008 -; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-DL-NEXT: v_perm_b32 v3, s2, s3, v3 -; GFX11-DL-NEXT: .p2align 6 -; GFX11-DL-NEXT: .LBB17_1: ; %.lr.ph -; GFX11-DL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-DL-NEXT: s_clause 0x4 -; GFX11-DL-NEXT: global_load_u8 v4, v2, s[0:1] offset:3 -; GFX11-DL-NEXT: global_load_u8 v5, v2, s[0:1] offset:4 -; GFX11-DL-NEXT: global_load_u8 v6, v2, s[0:1] offset:2 -; GFX11-DL-NEXT: global_load_u8 v7, v2, s[0:1] offset:1 -; GFX11-DL-NEXT: global_load_u8 v8, v2, s[0:1] -; GFX11-DL-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX11-DL-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v4, v5, v3, v4 neg_lo:[1,1,0] -; GFX11-DL-NEXT: global_store_b32 v[0:1], v4, off -; GFX11-DL-NEXT: s_cbranch_vccnz .LBB17_1 -; GFX11-DL-NEXT: ; %bb.2: ; %DummyReturnBlock -; GFX11-DL-NEXT: s_endpgm -.entry: - br label %.lr.ph - -.lr.ph: ; preds = %.lr.ph, %.entry - %l80 = load i8, ptr addrspace(1) %inptr1, align 1 - %op80 = sext i8 %l80 to i32 - %op81 = sext i8 %l81 to i32 - %mul8 = mul i32 %op80, %op81 - %gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 4 - %l50 = load i8, ptr addrspace(1) %gep50, align 1 - %op50 = sext i8 %l50 to i32 - %op51 = sext i8 %l51 to i32 - %mul5 = mul i32 %op50, %op51 - %gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 3 - %l40 = load i8, ptr addrspace(1) %gep40, align 1 - %gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 2 - %l30 = load i8, ptr addrspace(1) %gep30, align 1 - %gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 1 - %l20 = load i8, ptr addrspace(1) %gep20, align 1 - %ivadd31 = or i8 %l30, %l20 - %ivadd42 = or i8 %l40, %ivadd31 - %ivadd4 = sext i8 %ivadd42 to i32 - %ivadd5 = or i32 %mul5, %ivadd4 - %ivadd8 = add i32 %mul8, %ivadd5 - store i32 %ivadd8, ptr addrspace(1) null, align 4 - br label %.lr.ph -} - declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index e146cea50fa451..23e19bbe97153b 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -10,18 +10,18 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -44,19 +44,19 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -66,26 +66,26 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,60 +93,60 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -194,18 +194,18 @@ entry: define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -228,21 +228,21 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3 @@ -266,15 +266,15 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -285,64 +285,64 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u16 v3, v1, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u16 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 -; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -390,18 +390,18 @@ entry: define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -424,20 +424,20 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 @@ -457,14 +457,14 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -478,57 +478,57 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 -; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -568,20 +568,20 @@ entry: define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 @@ -591,25 +591,25 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 @@ -623,14 +623,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -638,64 +638,64 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot2_8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u8 v3, v2, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(2) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) @@ -703,7 +703,7 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3 -; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b8 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -732,18 +732,18 @@ entry: define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -766,20 +766,20 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 @@ -799,14 +799,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -820,57 +820,57 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationInsideMAD: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 -; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -910,18 +910,18 @@ entry: define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationAccrossMADs: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -944,20 +944,20 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 @@ -977,14 +977,14 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -998,57 +998,57 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 -; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1088,18 +1088,18 @@ entry: define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1123,19 +1123,19 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -1145,27 +1145,27 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2 +; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s0 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -1176,77 +1176,77 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s2 +; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s0 ; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 -; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v3, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1295,18 +1295,18 @@ entry: define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -1331,19 +1331,19 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 @@ -1353,28 +1353,28 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v4 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 @@ -1384,78 +1384,78 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s2 -; GFX9-NODL-NEXT: v_add_u32_e32 v4, s2, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0 +; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2 ; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_add_i32 s3, s2, s2 +; GFX9-DL-NEXT: s_add_i32 s1, s0, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: v_add3_u32 v1, s3, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: v_add3_u32 v1, s1, v3, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX10-DL-NEXT: s_add_i32 s2, s2, s2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX10-DL-NEXT: s_add_i32 s0, s0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, s2, v0, v1 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_multiuse_add1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_bfe_u32 v2, v1, 8, 8 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 ; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-DL-NEXT: s_add_i32 s2, s2, s2 +; GFX11-DL-NEXT: s_add_i32 s0, s0, s0 ; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_add3_u32 v0, s2, v2, v0 -; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: v_add3_u32 v0, s0, v2, v0 +; GFX11-DL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1504,18 +1504,18 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -1540,21 +1540,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 @@ -1578,15 +1578,15 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1597,27 +1597,27 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1628,25 +1628,25 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1662,21 +1662,20 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) @@ -1686,7 +1685,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[4:5] ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0302 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) @@ -1696,7 +1695,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3 -; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1745,18 +1744,18 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -1783,21 +1782,21 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 @@ -1823,15 +1822,15 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1843,7 +1842,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1851,20 +1850,20 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1876,7 +1875,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1884,22 +1883,22 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1919,27 +1918,26 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes2: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_and_b32_e32 v9, 0xff, v0 -; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[4:5] ; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 @@ -1958,7 +1956,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 -; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2007,18 +2005,18 @@ entry: define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -2041,19 +2039,19 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8 @@ -2065,24 +2063,24 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2090,60 +2088,60 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2177,18 +2175,18 @@ entry: define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -2215,21 +2213,21 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -2251,16 +2249,16 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff -; GFX9-NODL-NEXT: s_mov_b32 s3, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2268,13 +2266,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s3 -; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s3 -; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s3 -; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s3 +; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -2282,21 +2280,21 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff -; GFX9-DL-NEXT: s_mov_b32 s3, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2304,13 +2302,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s3 -; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s3 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s3 -; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s3 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -2318,22 +2316,22 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2357,24 +2355,23 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1 -; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[4:5] ; GFX11-DL-NEXT: v_lshrrev_b16 v4, 8, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_lshrrev_b16 v5, 8, v0 @@ -2403,7 +2400,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2437,18 +2434,18 @@ entry: define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -2471,20 +2468,20 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -2508,14 +2505,14 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2531,19 +2528,19 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2559,21 +2556,21 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2596,23 +2593,22 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 ; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc8_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: global_load_u8 v3, v2, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(2) ; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2644,7 +2640,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-NEXT: v_mad_u16 v0, v4, v7, v0 ; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b8 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2674,20 +2670,20 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 @@ -2695,26 +2691,26 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 @@ -2722,94 +2718,94 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2842,18 +2838,18 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -2873,19 +2869,19 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -2894,24 +2890,24 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -2920,74 +2916,74 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3027,18 +3023,18 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -3058,19 +3054,19 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3 @@ -3079,24 +3075,24 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -3105,74 +3101,74 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3213,18 +3209,18 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 @@ -3245,20 +3241,20 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8 @@ -3276,12 +3272,12 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -3292,55 +3288,54 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3385,7 +3380,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -3422,7 +3417,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3460,13 +3455,13 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -3478,20 +3473,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-DL-NEXT: s_load_dword s1, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -3500,19 +3495,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v3, v1, s1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -3521,12 +3516,12 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v0, s0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[10:11] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[14:15] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_3src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3594,7 +3589,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -3628,7 +3623,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3663,13 +3658,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 @@ -3680,21 +3675,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 @@ -3704,19 +3699,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -3726,12 +3721,12 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[14:15] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3794,55 +3789,56 @@ entry: define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s11 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 +; GFX7-NEXT: s_and_b32 s1, s12, 0xffff +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, s0, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, s1, v1 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX8-NEXT: s_and_b32 s3, s6, 0xffff +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_and_b32 s3, s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt vmcnt(1) @@ -3862,15 +3858,16 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_and_b32 s3, s8, 0xffff +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX9-NODL-NEXT: s_and_b32 s3, s6, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -3886,16 +3883,17 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_and_b32 s4, s8, 0xffff +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 +; GFX9-DL-NEXT: s_and_b32 s4, s6, 0xffff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 @@ -3912,16 +3910,18 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: udot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_and_b32 s3, s8, 0xffff +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX10-DL-NEXT: s_and_b32 s3, s6, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 @@ -3937,17 +3937,18 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: udot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b32 s6, s[4:5], 0x3c ; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_and_b32 s3, s8, 0xffff +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-DL-NEXT: s_and_b32 s3, s6, 0xffff +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 @@ -4005,18 +4006,18 @@ entry: define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -4036,19 +4037,19 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -4057,24 +4058,24 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -4083,74 +4084,74 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x3c ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) %src3, @@ -4195,7 +4196,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -4228,7 +4229,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4262,13 +4263,13 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] +; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 @@ -4279,21 +4280,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c ; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 @@ -4303,19 +4304,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[10:11] +; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -4325,12 +4326,12 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020101 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[14:15] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4393,25 +4394,25 @@ entry: define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s15, 0xf000 +; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, s15 +; GFX7-NEXT: s_mov_b32 s19, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] +; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13] ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] +; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 -; GFX7-NEXT: s_mov_b32 s14, -1 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 @@ -4419,7 +4420,7 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8 @@ -4429,29 +4430,29 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_4src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -4479,14 +4480,14 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-NODL-NEXT: global_load_dword v4, v0, s[14:15] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(3) @@ -4505,16 +4506,16 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX9-DL-NEXT: global_load_dword v4, v0, s[14:15] ; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c @@ -4534,15 +4535,15 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] -; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[12:13] +; GFX10-DL-NEXT: global_load_dword v4, v0, s[14:15] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501 @@ -4560,17 +4561,17 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-DL-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x3 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9] +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[10:11] +; GFX11-DL-NEXT: global_load_b32 v3, v0, s[12:13] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[14:15] ; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(2) ; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501 @@ -4642,19 +4643,19 @@ entry: define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_multi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[10:11] +; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -4685,19 +4686,19 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_multi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4705,7 +4706,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s2 +; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v1 ; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3 ; GFX8-NEXT: v_bfe_u32 v11, v1, 16, 8 @@ -4721,20 +4722,20 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_mad_u32_u24 v0, v10, v6, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc32_multi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -4746,76 +4747,76 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s2, v6 +; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s0, v6 ; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v7, v9 ; GFX9-NODL-NEXT: v_add3_u32 v0, v5, v3, v0 ; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v8, v1 -; GFX9-NODL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_multi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x6040200 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x2000200 +; GFX9-DL-NEXT: s_mov_b32 s4, 0x3010301 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] -; GFX9-DL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s4, 0x7050301 -; GFX9-DL-NEXT: s_mov_b32 s6, 0x3010301 +; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-DL-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x6040200 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x2000200 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x7050301 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s3 -; GFX9-DL-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s1 +; GFX9-DL-NEXT: v_perm_b32 v0, v1, v0, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v4, v5, s5 -; GFX9-DL-NEXT: v_perm_b32 v3, v3, v3, s6 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v4, v5, s3 +; GFX9-DL-NEXT: v_perm_b32 v3, v3, v3, s4 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v3, v1 -; GFX9-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_multi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v2, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v2, v1, v0, 0x6040200 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v4, v3, v3, 0x2000200 ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v0, 0x7050301 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s0 ; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0x3010301 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc32_multi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v2, v2, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v2, v2, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v3, v1, v0, 0x6040200 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) @@ -4824,10 +4825,10 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_perm_b32 v2, v2, v2, 0x3010301 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s2 +; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s0 ; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1 -; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v3, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -4903,18 +4904,18 @@ entry: define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hilo: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 @@ -4935,22 +4936,22 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hilo: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 @@ -4968,12 +4969,12 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hilo: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -4984,55 +4985,54 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hilo: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hilo: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_hilo: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] offset:4 -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] offset:4 +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5078,18 +5078,18 @@ entry: define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_lohi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 @@ -5110,22 +5110,22 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_lohi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 @@ -5143,12 +5143,12 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_lohi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -5159,67 +5159,66 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_lohi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x10302 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_lohi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x10302 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_lohi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] offset:4 +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x10302 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5265,18 +5264,18 @@ entry: define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hihi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8 @@ -5297,15 +5296,15 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hihi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -5313,8 +5312,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8 @@ -5332,12 +5331,12 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hihi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -5348,67 +5347,66 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hihi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x1030200 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3010002 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 -; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hihi: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] offset:4 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] offset:4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x1030200 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_hihi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:4 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] offset:4 +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] offset:4 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x1030200 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5454,8 +5452,8 @@ entry: define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v8i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, s7 @@ -5482,8 +5480,8 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v8i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -5508,8 +5506,8 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v8i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) @@ -5522,41 +5520,42 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v0, v1, v6, v0 -; GFX9-NODL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NODL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v8i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX9-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v8i8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_v8i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -5609,21 +5608,21 @@ entry: define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v16i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: s_mov_b64 s[8:9], s[10:11] +; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 @@ -5644,22 +5643,22 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v16i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2] ; GFX8-NEXT: flat_load_dword v4, v[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4 @@ -5676,16 +5675,16 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v16i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-NODL-NEXT: ; kill: killed $vgpr5 ; GFX9-NODL-NEXT: ; kill: killed $vgpr4 -; GFX9-NODL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX9-NODL-NEXT: ; kill: killed $vgpr5 +; GFX9-NODL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v0, v5, s[6:7] +; GFX9-NODL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v0, v5, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 @@ -5696,66 +5695,65 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0 -; GFX9-NODL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v16i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x7050002 +; GFX9-DL-NEXT: ; kill: killed $vgpr4 +; GFX9-DL-NEXT: ; kill: killed $vgpr5 +; GFX9-DL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] -; GFX9-DL-NEXT: global_load_dword v0, v5, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 +; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX9-DL-NEXT: global_load_dword v0, v5, s[2:3] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 -; GFX9-DL-NEXT: ; kill: killed $vgpr5 -; GFX9-DL-NEXT: ; kill: killed $vgpr4 -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s3 +; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v16i8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 -; GFX10-DL-NEXT: ; kill: killed $vgpr5 ; GFX10-DL-NEXT: ; kill: killed $vgpr4 +; GFX10-DL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3 +; GFX10-DL-NEXT: ; kill: killed $vgpr5 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] -; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7] +; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX10-DL-NEXT: global_load_dword v0, v5, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_v16i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7] +; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v4, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 @@ -5763,7 +5761,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -5809,20 +5807,20 @@ entry: define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v256i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 offset:252 -; GFX7-NEXT: buffer_load_dword v1, v[3:4], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[8:11], 0 addr64 offset:252 +; GFX7-NEXT: buffer_load_dword v1, v[3:4], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v4, v0, 16, 8 @@ -5843,24 +5841,24 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v256i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xfc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v1 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0xfc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 @@ -5878,13 +5876,13 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v256i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v2, v1, s[4:5] offset:252 -; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NODL-NEXT: global_load_dword v2, v1, s[0:1] offset:252 +; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v2 @@ -5895,67 +5893,66 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v6, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v256i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_dword v3, v1, s[4:5] offset:252 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x1000302 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v3, v1, s[0:1] offset:252 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x1000302 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v256i8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252 +; GFX10-DL-NEXT: global_load_dword v2, v1, s[2:3] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] offset:252 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v2, 0x3020001 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0x1000302 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_v256i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252 +; GFX11-DL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] offset:252 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x3020001 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x1000302 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -6001,139 +5998,139 @@ entry: define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_anyext: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v1, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc32_anyext: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_anyext: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_anyext: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0500 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0100 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0500 +; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_anyext: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0500 ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_anyext: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0500 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 99bb4d50b03d4c..25aa623295fe16 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -12,22 +12,22 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -63,24 +63,24 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 @@ -93,7 +93,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4 ; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 ; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4 @@ -109,25 +109,25 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0 ; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -154,81 +154,81 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v9, v10 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v12 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v13, v14 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v15, v16 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v8 ; GFX9-NEXT: v_add3_u32 v1, v2, v9, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32: ; GFX10-DL-XNACK: ; %bb.0: ; %entry ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] -; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2 -; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -307,22 +307,22 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -374,27 +374,27 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 @@ -454,20 +454,20 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 @@ -527,20 +527,20 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 @@ -600,20 +600,20 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX10-DL-XNACK-LABEL: idot8_acc16: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) @@ -678,21 +678,21 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -829,22 +829,22 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -896,27 +896,27 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 @@ -976,20 +976,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 @@ -1049,20 +1049,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 @@ -1122,20 +1122,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX10-DL-XNACK-LABEL: idot8_acc8: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) @@ -1200,21 +1200,21 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1352,22 +1352,22 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -1405,24 +1405,24 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 @@ -1434,7 +1434,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s0 ; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16 ; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4 @@ -1453,25 +1453,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1 ; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1506,23 +1506,23 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1557,26 +1557,27 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-XNACK: ; %bb.0: ; %entry ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4 @@ -1590,7 +1591,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s0 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 @@ -1610,26 +1611,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5 -; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] -; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4 @@ -1643,7 +1643,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2 +; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s0 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 @@ -1663,7 +1663,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5 -; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1743,22 +1743,22 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -1794,24 +1794,24 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3 @@ -1832,7 +1832,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0 ; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0 @@ -1840,25 +1840,25 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0 ; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1884,7 +1884,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -1892,74 +1892,74 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-XNACK-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] -; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2 -; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0 +; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2002,22 +2002,22 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -2069,27 +2069,27 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 @@ -2149,21 +2149,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 @@ -2235,21 +2235,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 @@ -2321,20 +2321,20 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) @@ -2415,21 +2415,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 @@ -2547,22 +2547,22 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -2614,27 +2614,27 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 @@ -2714,20 +2714,20 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2806,20 +2806,20 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s14, -1 +; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2898,21 +2898,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 @@ -2999,21 +2999,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 779107cc40e1fb..d8491f322e69a0 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -10,22 +10,22 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -61,24 +61,24 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 @@ -99,7 +99,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 @@ -107,25 +107,25 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -151,7 +151,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -159,51 +159,52 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -282,22 +283,22 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -333,26 +334,26 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 @@ -386,19 +387,19 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -427,24 +428,24 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -473,27 +474,27 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -521,7 +522,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -600,22 +601,22 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -651,26 +652,26 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 @@ -704,19 +705,19 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -745,24 +746,24 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -791,27 +792,27 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -839,7 +840,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -918,22 +919,22 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -970,26 +971,26 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 @@ -1024,19 +1025,19 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1066,24 +1067,24 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1113,27 +1114,27 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1162,7 +1163,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1225,22 +1226,22 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -1277,26 +1278,26 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 @@ -1331,19 +1332,19 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1373,24 +1374,24 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1420,27 +1421,27 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1469,7 +1470,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1530,22 +1531,22 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -1583,24 +1584,24 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 @@ -1621,7 +1622,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16 ; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16 ; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3 @@ -1631,25 +1632,25 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1672,7 +1673,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1684,23 +1685,23 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1723,7 +1724,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1735,26 +1736,27 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -1770,7 +1772,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0 ; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11 @@ -1788,7 +1790,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 -; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1867,22 +1869,22 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -1918,24 +1920,24 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 @@ -1956,7 +1958,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 @@ -1964,25 +1966,25 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -2008,7 +2010,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -2016,51 +2018,52 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2104,22 +2107,22 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -2155,26 +2158,26 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 @@ -2208,20 +2211,20 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 @@ -2231,16 +2234,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2251,9 +2254,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2262,25 +2265,25 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 @@ -2290,16 +2293,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2310,9 +2313,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2321,27 +2324,27 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2385,7 +2388,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2428,22 +2431,22 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -2479,26 +2482,26 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 @@ -2552,19 +2555,19 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 @@ -2617,19 +2620,19 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 @@ -2683,21 +2686,21 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2748,7 +2751,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v4, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2791,22 +2794,22 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s11 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 @@ -2843,26 +2846,26 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 +; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 @@ -2897,20 +2900,20 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 @@ -2920,16 +2923,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2940,9 +2943,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2952,25 +2955,25 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 @@ -2980,16 +2983,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -3000,9 +3003,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -3012,27 +3015,27 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -3077,7 +3080,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -3115,18 +3118,18 @@ entry: define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX7-LABEL: udot8_variant1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) @@ -3161,19 +3164,19 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; ; GFX8-LABEL: udot8_variant1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 15, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4 @@ -3193,7 +3196,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0 @@ -3201,20 +3204,20 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX8-NEXT: v_mad_u32_u24 v0, v11, v10, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v13, v12, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v15, v14, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v3, 15, v1 @@ -3239,7 +3242,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v6, v5 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v8, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v3, s2, v1 +; GFX9-NEXT: v_add3_u32 v1, v3, s0, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v10, v9 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v12, v11 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v5 @@ -3247,39 +3250,40 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v16, v15 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v9 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 -; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %v2addr, ptr addrspace(1) %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index 0f40d010e2a3a9..a328bbe8b4ddc1 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_lo: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 5 @@ -17,7 +17,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_lo: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 5 @@ -34,7 +34,7 @@ entry: define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_hi: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -45,7 +45,7 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_hi: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -72,7 +72,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -87,7 +87,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_neg_0.0_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -97,7 +97,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -111,7 +111,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -145,7 +145,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -159,7 +159,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -169,7 +169,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -183,7 +183,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -193,7 +193,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -207,7 +207,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -217,7 +217,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -231,7 +231,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -265,7 +265,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -289,7 +289,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -313,7 +313,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -327,7 +327,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -351,7 +351,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e22f983 @@ -361,7 +361,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.15915494 @@ -375,7 +375,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -385,7 +385,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -399,7 +399,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -409,7 +409,7 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -423,23 +423,23 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 0 +; SI-NEXT: v_add_f32_e64 v0, s6, 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 0 +; VI-NEXT: v_add_f32_e64 v0, s6, 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.0 @@ -450,23 +450,23 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 0.5 +; SI-NEXT: v_add_f32_e64 v0, s6, 0.5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 0.5 +; VI-NEXT: v_add_f32_e64 v0, s6, 0.5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0.5 @@ -477,23 +477,23 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, -0.5 +; SI-NEXT: v_add_f32_e64 v0, s6, -0.5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, -0.5 +; VI-NEXT: v_add_f32_e64 v0, s6, -0.5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -0.5 @@ -504,23 +504,23 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s6, 1.0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s6, 1.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 1.0 @@ -531,23 +531,23 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, -1.0 +; SI-NEXT: v_add_f32_e64 v0, s6, -1.0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, -1.0 +; VI-NEXT: v_add_f32_e64 v0, s6, -1.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -1.0 @@ -558,23 +558,23 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 2.0 +; SI-NEXT: v_add_f32_e64 v0, s6, 2.0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 2.0 +; VI-NEXT: v_add_f32_e64 v0, s6, 2.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 2.0 @@ -585,23 +585,23 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, -2.0 +; SI-NEXT: v_add_f32_e64 v0, s6, -2.0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, -2.0 +; VI-NEXT: v_add_f32_e64 v0, s6, -2.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -2.0 @@ -612,23 +612,23 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 4.0 +; SI-NEXT: v_add_f32_e64 v0, s6, 4.0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 4.0 +; VI-NEXT: v_add_f32_e64 v0, s6, 4.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 4.0 @@ -639,23 +639,23 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, -4.0 +; SI-NEXT: v_add_f32_e64 v0, s6, -4.0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, -4.0 +; VI-NEXT: v_add_f32_e64 v0, s6, -4.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, -4.0 @@ -666,7 +666,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -684,7 +684,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -708,7 +708,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_literal_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -726,7 +726,7 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -750,23 +750,23 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 1 +; SI-NEXT: v_add_f32_e64 v0, s6, 1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 1 +; VI-NEXT: v_add_f32_e64 v0, s6, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36a0000000000000 @@ -777,23 +777,23 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 2 +; SI-NEXT: v_add_f32_e64 v0, s6, 2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 2 +; VI-NEXT: v_add_f32_e64 v0, s6, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36b0000000000000 @@ -804,23 +804,23 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 16 +; SI-NEXT: v_add_f32_e64 v0, s6, 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 16 +; VI-NEXT: v_add_f32_e64 v0, s6, 16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36e0000000000000 @@ -831,24 +831,24 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s4, s4, -1 +; SI-NEXT: s_add_i32 s4, s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s4, s4, -1 +; VI-NEXT: s_add_i32 s4, s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -862,24 +862,24 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s4, s4, -2 +; SI-NEXT: s_add_i32 s4, s6, -2 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s4, s4, -2 +; VI-NEXT: s_add_i32 s4, s6, -2 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -893,24 +893,24 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s4, s4, -16 +; SI-NEXT: s_add_i32 s4, s6, -16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s4, s4, -16 +; VI-NEXT: s_add_i32 s4, s6, -16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -924,23 +924,23 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_63_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 63 +; SI-NEXT: v_add_f32_e64 v0, s6, 63 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_63_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 63 +; VI-NEXT: v_add_f32_e64 v0, s6, 63 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x36ff800000000000 @@ -951,23 +951,23 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, s4, 64 +; SI-NEXT: v_add_f32_e64 v0, s6, 64 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v0, s4, 64 +; VI-NEXT: v_add_f32_e64 v0, s6, 64 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd float %x, 0x3700000000000000 @@ -978,25 +978,25 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.0 @@ -1007,25 +1007,25 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.5 @@ -1036,25 +1036,25 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -0.5 @@ -1065,25 +1065,25 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 1.0 @@ -1094,25 +1094,25 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -1.0 @@ -1123,25 +1123,25 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 2.0 @@ -1152,25 +1152,25 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -2.0 @@ -1181,25 +1181,25 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 4.0 @@ -1210,25 +1210,25 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -4.0 @@ -1239,27 +1239,27 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 0x3fc45f30 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.15915494309189532 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.15915494309189532 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x3fc45f306dc9c882 @@ -1270,29 +1270,29 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xbfc45f306dc9c882 @@ -1303,25 +1303,25 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000001 @@ -1332,25 +1332,25 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000002 @@ -1361,25 +1361,25 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 16 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 16 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 16 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 16 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000010 @@ -1390,7 +1390,7 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, -1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, -1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1417,7 +1417,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -2 @@ -1428,7 +1428,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2 @@ -1444,7 +1444,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -16 @@ -1455,7 +1455,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -16 @@ -1471,25 +1471,25 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_63_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 63 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 63 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_63_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 63 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 63 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x000000000000003F @@ -1500,25 +1500,25 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_64_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_add_f64 v[0:1], s[0:1], 64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_64_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], s[0:1], 64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000040 @@ -1529,7 +1529,7 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1555,7 +1555,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_neg_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_literal_imm_neg_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1581,7 +1581,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1618,7 +1618,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1670,7 +1670,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1696,7 +1696,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1722,7 +1722,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1737,7 +1737,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1748,7 +1748,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1763,7 +1763,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1774,7 +1774,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1789,7 +1789,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1800,7 +1800,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1815,7 +1815,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1826,7 +1826,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1841,7 +1841,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index 02a9169c0e6fae..a2cc427bf6e548 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_neg_0.0_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -20,7 +20,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_neg_0.0_i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -31,7 +31,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -42,7 +42,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_neg_0.0_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -57,7 +57,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -67,7 +67,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -77,7 +77,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -87,7 +87,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -101,7 +101,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_imm_neg_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -111,7 +111,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_imm_neg_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -121,7 +121,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -131,7 +131,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_imm_neg_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -145,7 +145,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -155,7 +155,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -165,7 +165,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] @@ -175,7 +175,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 @@ -189,7 +189,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -199,7 +199,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -209,7 +209,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] @@ -219,7 +219,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb800 @@ -233,7 +233,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -243,7 +243,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -253,7 +253,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] @@ -263,7 +263,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -277,7 +277,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -287,7 +287,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -297,7 +297,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] @@ -307,7 +307,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -321,7 +321,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_2.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -331,7 +331,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -341,7 +341,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] @@ -351,7 +351,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4000 @@ -365,7 +365,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_2.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -375,7 +375,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -385,7 +385,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] @@ -395,7 +395,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc000 @@ -409,7 +409,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_4.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -419,7 +419,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -429,7 +429,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] @@ -439,7 +439,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -453,7 +453,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_4.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -463,7 +463,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -473,7 +473,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] @@ -483,7 +483,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc400 @@ -497,7 +497,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -507,7 +507,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -517,7 +517,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] @@ -527,7 +527,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3118 @@ -541,7 +541,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -551,7 +551,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -561,7 +561,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] @@ -571,7 +571,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; SI-LABEL: store_inline_imm_m_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb118 @@ -585,7 +585,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_literal_imm_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -595,7 +595,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_literal_imm_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -605,7 +605,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] @@ -615,7 +615,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_literal_imm_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6c00 @@ -630,8 +630,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] @@ -642,19 +642,19 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x00,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -664,12 +664,12 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; ; SI-LABEL: add_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -684,8 +684,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] @@ -696,19 +696,19 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe0,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -718,12 +718,12 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; ; SI-LABEL: add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -738,8 +738,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] @@ -750,19 +750,19 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe2,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -772,12 +772,12 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; ; SI-LABEL: add_inline_imm_neg_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, -0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -792,8 +792,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] @@ -804,19 +804,19 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe4,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -826,12 +826,12 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; ; SI-LABEL: add_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -846,8 +846,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] @@ -858,19 +858,19 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe6,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -880,12 +880,12 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; ; SI-LABEL: add_inline_imm_neg_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -900,8 +900,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] @@ -912,19 +912,19 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe8,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -934,12 +934,12 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; ; SI-LABEL: add_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -954,8 +954,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] @@ -966,19 +966,19 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xea,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -988,12 +988,12 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; ; SI-LABEL: add_inline_imm_neg_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, -2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1008,8 +1008,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] @@ -1020,19 +1020,19 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xec,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -1042,12 +1042,12 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; ; SI-LABEL: add_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1062,8 +1062,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] @@ -1074,19 +1074,19 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xee,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -1096,12 +1096,12 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; ; SI-LABEL: add_inline_imm_neg_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1115,7 +1115,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: commute_add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; SI-LABEL: commute_add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_literal_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: commute_add_literal_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; SI-LABEL: commute_add_literal_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1276,8 +1276,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_1_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] @@ -1288,19 +1288,19 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_1_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x02,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -1310,12 +1310,12 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; ; SI-LABEL: add_inline_imm_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 0x33800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1330,8 +1330,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] @@ -1342,19 +1342,19 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x04,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -1364,12 +1364,12 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; ; SI-LABEL: add_inline_imm_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 0x34000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1384,8 +1384,8 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_16_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] @@ -1396,19 +1396,19 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_16_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x20,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -1418,12 +1418,12 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; ; SI-LABEL: add_inline_imm_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 0x35800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_1_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1455,7 +1455,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1473,7 +1473,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1491,7 +1491,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1516,7 +1516,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_2_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1534,7 +1534,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1595,7 +1595,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_16_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: add_inline_imm_neg_16_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; VI-LABEL: add_inline_imm_neg_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; encoding: [0x04,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1649,7 +1649,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; SI-LABEL: add_inline_imm_neg_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1675,8 +1675,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_63_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] @@ -1687,19 +1687,19 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_63_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x7e,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -1709,12 +1709,12 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; ; SI-LABEL: add_inline_imm_63_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 0x367c0000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1729,8 +1729,8 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_64_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 ; encoding: [0x84,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] @@ -1741,19 +1741,19 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x80,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; encoding: [0x04,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; encoding: [0x04,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] @@ -1763,12 +1763,12 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; ; SI-LABEL: add_inline_imm_64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_add_f32_e32 v0, 0x36800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index b89dbd42e0466f..7f334e0ca21e1f 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -33,8 +33,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 @@ -123,8 +123,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc -; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc +; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -177,8 +177,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 -; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8 +; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -201,7 +201,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -224,7 +224,7 @@ define amdgpu_kernel void @llvm_trap() { ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; @@ -287,9 +287,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_add_u32 s0, s6, 8 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc -; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 +; GFX8V5-NEXT: s_addc_u32 s1, s9, 0 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 @@ -298,9 +300,9 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: v_mov_b32_e32 v2, s8 -; GFX8V5-NEXT: v_mov_b32_e32 v3, s9 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, s10 +; GFX8V5-NEXT: v_mov_b32_e32 v3, s11 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 @@ -328,14 +330,14 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { ; GFX9V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 -; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc -; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc ; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc -; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1 -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: v_mov_b32_e32 v0, s8 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s9 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V5-NEXT: ; kill: killed $sgpr6_sgpr7 ; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 603f457f3e05e4..6fed9889838cd8 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; GENERIC-LABEL: extract_w_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 @@ -80,8 +80,8 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; NOOPT-LABEL: extract_w_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -148,14 +148,14 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; SI-MOVREL-LABEL: extract_w_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 +; SI-MOVREL-NEXT: s_mov_b32 m0, s6 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 @@ -177,14 +177,14 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-MOVREL-LABEL: extract_w_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1 -; VI-MOVREL-NEXT: s_mov_b32 m0, s4 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -206,13 +206,13 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-IDXMODE-LABEL: extract_w_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -226,7 +226,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 @@ -236,13 +236,13 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-IDXMODE-LABEL: extract_w_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -257,7 +257,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] @@ -273,58 +273,58 @@ entry: define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) { ; GENERIC-LABEL: extract_w_offset_salu_use_vector: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dword s20, s[2:3], 0xb -; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s2, s[4:5], 0xb +; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_add_i32 s20, s20, 1 -; GENERIC-NEXT: s_or_b32 s2, s19, 16 -; GENERIC-NEXT: s_or_b32 s18, s18, 15 -; GENERIC-NEXT: s_or_b32 s17, s17, 14 -; GENERIC-NEXT: s_or_b32 s16, s16, 13 -; GENERIC-NEXT: s_or_b32 s15, s15, 12 -; GENERIC-NEXT: s_or_b32 s14, s14, 11 -; GENERIC-NEXT: s_or_b32 s13, s13, 10 -; GENERIC-NEXT: s_or_b32 s12, s12, 9 -; GENERIC-NEXT: s_or_b32 s11, s11, 8 -; GENERIC-NEXT: s_or_b32 s10, s10, 7 -; GENERIC-NEXT: s_or_b32 s9, s9, 6 -; GENERIC-NEXT: s_or_b32 s8, s8, 5 -; GENERIC-NEXT: s_or_b32 s7, s7, 4 -; GENERIC-NEXT: s_or_b32 s6, s6, 3 -; GENERIC-NEXT: s_or_b32 s4, s4, 1 -; GENERIC-NEXT: s_or_b32 s5, s5, 2 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 1 -; GENERIC-NEXT: s_cselect_b32 s4, s5, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 2 -; GENERIC-NEXT: s_cselect_b32 s4, s6, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 3 -; GENERIC-NEXT: s_cselect_b32 s4, s7, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 4 -; GENERIC-NEXT: s_cselect_b32 s4, s8, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 5 -; GENERIC-NEXT: s_cselect_b32 s4, s9, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 6 -; GENERIC-NEXT: s_cselect_b32 s4, s10, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 7 -; GENERIC-NEXT: s_cselect_b32 s4, s11, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 8 -; GENERIC-NEXT: s_cselect_b32 s4, s12, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 9 -; GENERIC-NEXT: s_cselect_b32 s4, s13, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 10 -; GENERIC-NEXT: s_cselect_b32 s4, s14, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 11 -; GENERIC-NEXT: s_cselect_b32 s4, s15, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 12 -; GENERIC-NEXT: s_cselect_b32 s4, s16, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 13 -; GENERIC-NEXT: s_cselect_b32 s4, s17, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 14 -; GENERIC-NEXT: s_cselect_b32 s4, s18, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 15 -; GENERIC-NEXT: s_cselect_b32 s4, s2, s4 +; GENERIC-NEXT: s_add_i32 s2, s2, 1 +; GENERIC-NEXT: s_or_b32 s4, s23, 16 +; GENERIC-NEXT: s_or_b32 s5, s22, 15 +; GENERIC-NEXT: s_or_b32 s6, s21, 14 +; GENERIC-NEXT: s_or_b32 s7, s20, 13 +; GENERIC-NEXT: s_or_b32 s19, s19, 12 +; GENERIC-NEXT: s_or_b32 s18, s18, 11 +; GENERIC-NEXT: s_or_b32 s17, s17, 10 +; GENERIC-NEXT: s_or_b32 s16, s16, 9 +; GENERIC-NEXT: s_or_b32 s15, s15, 8 +; GENERIC-NEXT: s_or_b32 s14, s14, 7 +; GENERIC-NEXT: s_or_b32 s13, s13, 6 +; GENERIC-NEXT: s_or_b32 s12, s12, 5 +; GENERIC-NEXT: s_or_b32 s11, s11, 4 +; GENERIC-NEXT: s_or_b32 s10, s10, 3 +; GENERIC-NEXT: s_or_b32 s8, s8, 1 +; GENERIC-NEXT: s_or_b32 s9, s9, 2 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 +; GENERIC-NEXT: s_cselect_b32 s8, s9, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 2 +; GENERIC-NEXT: s_cselect_b32 s8, s10, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 3 +; GENERIC-NEXT: s_cselect_b32 s8, s11, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 4 +; GENERIC-NEXT: s_cselect_b32 s8, s12, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 5 +; GENERIC-NEXT: s_cselect_b32 s8, s13, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 6 +; GENERIC-NEXT: s_cselect_b32 s8, s14, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 7 +; GENERIC-NEXT: s_cselect_b32 s8, s15, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 8 +; GENERIC-NEXT: s_cselect_b32 s8, s16, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 9 +; GENERIC-NEXT: s_cselect_b32 s8, s17, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 10 +; GENERIC-NEXT: s_cselect_b32 s8, s18, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 11 +; GENERIC-NEXT: s_cselect_b32 s8, s19, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 12 +; GENERIC-NEXT: s_cselect_b32 s7, s7, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 13 +; GENERIC-NEXT: s_cselect_b32 s6, s6, s7 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 14 +; GENERIC-NEXT: s_cselect_b32 s5, s5, s6 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 15 +; GENERIC-NEXT: s_cselect_b32 s4, s4, s5 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, s4 ; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -332,6 +332,7 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; ; NOOPT-LABEL: extract_w_offset_salu_use_vector: ; NOOPT: ; %bb.0: ; %entry +; NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5] ; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb ; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19 @@ -432,90 +433,90 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; ; SI-MOVREL-LABEL: extract_w_offset_salu_use_vector: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xb -; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s20, s20, 1 -; SI-MOVREL-NEXT: s_or_b32 s4, s4, 1 -; SI-MOVREL-NEXT: s_or_b32 s19, s19, 16 -; SI-MOVREL-NEXT: s_or_b32 s18, s18, 15 -; SI-MOVREL-NEXT: s_or_b32 s17, s17, 14 -; SI-MOVREL-NEXT: s_or_b32 s16, s16, 13 -; SI-MOVREL-NEXT: s_or_b32 s15, s15, 12 -; SI-MOVREL-NEXT: s_or_b32 s14, s14, 11 -; SI-MOVREL-NEXT: s_or_b32 s13, s13, 10 -; SI-MOVREL-NEXT: s_or_b32 s12, s12, 9 -; SI-MOVREL-NEXT: s_or_b32 s11, s11, 8 -; SI-MOVREL-NEXT: s_or_b32 s10, s10, 7 -; SI-MOVREL-NEXT: s_or_b32 s9, s9, 6 -; SI-MOVREL-NEXT: s_or_b32 s8, s8, 5 -; SI-MOVREL-NEXT: s_or_b32 s7, s7, 4 -; SI-MOVREL-NEXT: s_or_b32 s6, s6, 3 -; SI-MOVREL-NEXT: s_or_b32 s5, s5, 2 -; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; SI-MOVREL-NEXT: s_mov_b32 m0, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 +; SI-MOVREL-NEXT: s_or_b32 s8, s8, 1 +; SI-MOVREL-NEXT: s_or_b32 s4, s23, 16 +; SI-MOVREL-NEXT: s_or_b32 s5, s22, 15 +; SI-MOVREL-NEXT: s_or_b32 s7, s21, 14 +; SI-MOVREL-NEXT: s_or_b32 s20, s20, 13 +; SI-MOVREL-NEXT: s_or_b32 s19, s19, 12 +; SI-MOVREL-NEXT: s_or_b32 s18, s18, 11 +; SI-MOVREL-NEXT: s_or_b32 s17, s17, 10 +; SI-MOVREL-NEXT: s_or_b32 s16, s16, 9 +; SI-MOVREL-NEXT: s_or_b32 s15, s15, 8 +; SI-MOVREL-NEXT: s_or_b32 s14, s14, 7 +; SI-MOVREL-NEXT: s_or_b32 s13, s13, 6 +; SI-MOVREL-NEXT: s_or_b32 s12, s12, 5 +; SI-MOVREL-NEXT: s_or_b32 s11, s11, 4 +; SI-MOVREL-NEXT: s_or_b32 s10, s10, 3 +; SI-MOVREL-NEXT: s_or_b32 s9, s9, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 +; SI-MOVREL-NEXT: s_mov_b32 m0, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s5 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-MOVREL-LABEL: extract_w_offset_salu_use_vector: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x2c -; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s20, s20, 1 -; VI-MOVREL-NEXT: s_or_b32 s6, s6, 3 -; VI-MOVREL-NEXT: s_or_b32 s5, s5, 2 -; VI-MOVREL-NEXT: s_or_b32 s4, s4, 1 -; VI-MOVREL-NEXT: s_or_b32 s2, s19, 16 -; VI-MOVREL-NEXT: s_or_b32 s3, s18, 15 -; VI-MOVREL-NEXT: s_or_b32 s17, s17, 14 -; VI-MOVREL-NEXT: s_or_b32 s16, s16, 13 -; VI-MOVREL-NEXT: s_or_b32 s15, s15, 12 -; VI-MOVREL-NEXT: s_or_b32 s14, s14, 11 -; VI-MOVREL-NEXT: s_or_b32 s13, s13, 10 -; VI-MOVREL-NEXT: s_or_b32 s12, s12, 9 -; VI-MOVREL-NEXT: s_or_b32 s11, s11, 8 -; VI-MOVREL-NEXT: s_or_b32 s10, s10, 7 -; VI-MOVREL-NEXT: s_or_b32 s9, s9, 6 -; VI-MOVREL-NEXT: s_or_b32 s8, s8, 5 -; VI-MOVREL-NEXT: s_or_b32 s7, s7, 4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; VI-MOVREL-NEXT: s_mov_b32 m0, s20 -; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3 -; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 +; VI-MOVREL-NEXT: s_or_b32 s10, s10, 3 +; VI-MOVREL-NEXT: s_or_b32 s9, s9, 2 +; VI-MOVREL-NEXT: s_or_b32 s8, s8, 1 +; VI-MOVREL-NEXT: s_or_b32 s3, s23, 16 +; VI-MOVREL-NEXT: s_or_b32 s4, s22, 15 +; VI-MOVREL-NEXT: s_or_b32 s5, s21, 14 +; VI-MOVREL-NEXT: s_or_b32 s6, s20, 13 +; VI-MOVREL-NEXT: s_or_b32 s7, s19, 12 +; VI-MOVREL-NEXT: s_or_b32 s18, s18, 11 +; VI-MOVREL-NEXT: s_or_b32 s17, s17, 10 +; VI-MOVREL-NEXT: s_or_b32 s16, s16, 9 +; VI-MOVREL-NEXT: s_or_b32 s15, s15, 8 +; VI-MOVREL-NEXT: s_or_b32 s14, s14, 7 +; VI-MOVREL-NEXT: s_or_b32 s13, s13, 6 +; VI-MOVREL-NEXT: s_or_b32 s12, s12, 5 +; VI-MOVREL-NEXT: s_or_b32 s11, s11, 4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 @@ -524,44 +525,44 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; ; VI-IDXMODE-LABEL: extract_w_offset_salu_use_vector: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c -; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_add_i32 s20, s20, 1 -; VI-IDXMODE-NEXT: s_or_b32 s6, s6, 3 -; VI-IDXMODE-NEXT: s_or_b32 s5, s5, 2 -; VI-IDXMODE-NEXT: s_or_b32 s4, s4, 1 -; VI-IDXMODE-NEXT: s_or_b32 s2, s19, 16 -; VI-IDXMODE-NEXT: s_or_b32 s3, s18, 15 -; VI-IDXMODE-NEXT: s_or_b32 s17, s17, 14 -; VI-IDXMODE-NEXT: s_or_b32 s16, s16, 13 -; VI-IDXMODE-NEXT: s_or_b32 s15, s15, 12 -; VI-IDXMODE-NEXT: s_or_b32 s14, s14, 11 -; VI-IDXMODE-NEXT: s_or_b32 s13, s13, 10 -; VI-IDXMODE-NEXT: s_or_b32 s12, s12, 9 -; VI-IDXMODE-NEXT: s_or_b32 s11, s11, 8 -; VI-IDXMODE-NEXT: s_or_b32 s10, s10, 7 -; VI-IDXMODE-NEXT: s_or_b32 s9, s9, 6 -; VI-IDXMODE-NEXT: s_or_b32 s8, s8, 5 -; VI-IDXMODE-NEXT: s_or_b32 s7, s7, 4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 1 +; VI-IDXMODE-NEXT: s_or_b32 s10, s10, 3 +; VI-IDXMODE-NEXT: s_or_b32 s9, s9, 2 +; VI-IDXMODE-NEXT: s_or_b32 s8, s8, 1 +; VI-IDXMODE-NEXT: s_or_b32 s3, s23, 16 +; VI-IDXMODE-NEXT: s_or_b32 s4, s22, 15 +; VI-IDXMODE-NEXT: s_or_b32 s5, s21, 14 +; VI-IDXMODE-NEXT: s_or_b32 s6, s20, 13 +; VI-IDXMODE-NEXT: s_or_b32 s7, s19, 12 +; VI-IDXMODE-NEXT: s_or_b32 s18, s18, 11 +; VI-IDXMODE-NEXT: s_or_b32 s17, s17, 10 +; VI-IDXMODE-NEXT: s_or_b32 s16, s16, 9 +; VI-IDXMODE-NEXT: s_or_b32 s15, s15, 8 +; VI-IDXMODE-NEXT: s_or_b32 s14, s14, 7 +; VI-IDXMODE-NEXT: s_or_b32 s13, s13, 6 +; VI-IDXMODE-NEXT: s_or_b32 s12, s12, 5 +; VI-IDXMODE-NEXT: s_or_b32 s11, s11, 4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s3 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 @@ -571,45 +572,45 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; ; GFX9-IDXMODE-LABEL: extract_w_offset_salu_use_vector: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_add_i32 s20, s20, 1 -; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, 1 -; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, 16 -; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, 15 -; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, 14 -; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, 13 -; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, 12 -; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, 11 -; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, 10 -; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, 9 -; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, 8 -; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, 7 -; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, 6 -; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, 5 -; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, 4 -; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, 3 -; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, 2 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 1 +; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, 1 +; GFX9-IDXMODE-NEXT: s_or_b32 s3, s23, 16 +; GFX9-IDXMODE-NEXT: s_or_b32 s4, s22, 15 +; GFX9-IDXMODE-NEXT: s_or_b32 s5, s21, 14 +; GFX9-IDXMODE-NEXT: s_or_b32 s6, s20, 13 +; GFX9-IDXMODE-NEXT: s_or_b32 s7, s19, 12 +; GFX9-IDXMODE-NEXT: s_or_b32 s18, s18, 11 +; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, 10 +; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, 9 +; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, 8 +; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, 7 +; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, 6 +; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, 5 +; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, 4 +; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, 3 +; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s3 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] @@ -625,8 +626,8 @@ entry: define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; GENERIC-LABEL: extract_wo_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dword s6, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s6, s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000 @@ -693,8 +694,8 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; NOOPT-LABEL: extract_wo_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -759,8 +760,8 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; SI-MOVREL-LABEL: extract_wo_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 @@ -787,8 +788,8 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-MOVREL-LABEL: extract_wo_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 @@ -815,8 +816,8 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-IDXMODE-LABEL: extract_wo_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 @@ -844,8 +845,8 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-IDXMODE-LABEL: extract_wo_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 @@ -864,7 +865,7 @@ define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] @@ -878,11 +879,11 @@ entry: define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) { ; GENERIC-LABEL: extract_neg_offset_sgpr: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s2, s[4:5], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_add_i32 s2, s4, 0xfffffe00 +; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 ; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GENERIC-NEXT: s_cmp_lg_u32 s2, 2 @@ -922,8 +923,8 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; ; NOOPT-LABEL: extract_neg_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -988,8 +989,8 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; ; SI-MOVREL-LABEL: extract_neg_offset_sgpr: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 @@ -1016,8 +1017,8 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; ; VI-MOVREL-LABEL: extract_neg_offset_sgpr: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 @@ -1044,8 +1045,8 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; ; VI-IDXMODE-LABEL: extract_neg_offset_sgpr: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 @@ -1074,14 +1075,14 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; ; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00 +; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 5 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 6 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 7 @@ -1094,7 +1095,7 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 15 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 16 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dword v0, v1, s[0:1] @@ -1109,59 +1110,59 @@ entry: define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) { ; GENERIC-LABEL: extract_neg_offset_sgpr_loaded: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x39 -; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39 +; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_addk_i32 s20, 0xfe00 -; GENERIC-NEXT: s_or_b32 s2, s19, s51 -; GENERIC-NEXT: s_or_b32 s18, s18, s50 -; GENERIC-NEXT: s_or_b32 s17, s17, s49 -; GENERIC-NEXT: s_or_b32 s16, s16, s48 -; GENERIC-NEXT: s_or_b32 s15, s15, s47 -; GENERIC-NEXT: s_or_b32 s14, s14, s46 -; GENERIC-NEXT: s_or_b32 s13, s13, s45 -; GENERIC-NEXT: s_or_b32 s12, s12, s44 -; GENERIC-NEXT: s_or_b32 s11, s11, s43 -; GENERIC-NEXT: s_or_b32 s10, s10, s42 -; GENERIC-NEXT: s_or_b32 s9, s9, s41 -; GENERIC-NEXT: s_or_b32 s8, s8, s40 -; GENERIC-NEXT: s_or_b32 s7, s7, s39 -; GENERIC-NEXT: s_or_b32 s6, s6, s38 -; GENERIC-NEXT: s_or_b32 s4, s4, s36 -; GENERIC-NEXT: s_or_b32 s5, s5, s37 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 1 -; GENERIC-NEXT: s_cselect_b32 s4, s5, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 2 -; GENERIC-NEXT: s_cselect_b32 s4, s6, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 3 -; GENERIC-NEXT: s_cselect_b32 s4, s7, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 4 -; GENERIC-NEXT: s_cselect_b32 s4, s8, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 5 -; GENERIC-NEXT: s_cselect_b32 s4, s9, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 6 -; GENERIC-NEXT: s_cselect_b32 s4, s10, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 7 -; GENERIC-NEXT: s_cselect_b32 s4, s11, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 8 -; GENERIC-NEXT: s_cselect_b32 s4, s12, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 9 -; GENERIC-NEXT: s_cselect_b32 s4, s13, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 10 -; GENERIC-NEXT: s_cselect_b32 s4, s14, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 11 -; GENERIC-NEXT: s_cselect_b32 s4, s15, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 12 -; GENERIC-NEXT: s_cselect_b32 s4, s16, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 13 -; GENERIC-NEXT: s_cselect_b32 s4, s17, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 14 -; GENERIC-NEXT: s_cselect_b32 s4, s18, s4 -; GENERIC-NEXT: s_cmp_eq_u32 s20, 15 -; GENERIC-NEXT: s_cselect_b32 s4, s2, s4 +; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 +; GENERIC-NEXT: s_or_b32 s4, s23, s51 +; GENERIC-NEXT: s_or_b32 s5, s22, s50 +; GENERIC-NEXT: s_or_b32 s6, s21, s49 +; GENERIC-NEXT: s_or_b32 s7, s20, s48 +; GENERIC-NEXT: s_or_b32 s19, s19, s47 +; GENERIC-NEXT: s_or_b32 s18, s18, s46 +; GENERIC-NEXT: s_or_b32 s17, s17, s45 +; GENERIC-NEXT: s_or_b32 s16, s16, s44 +; GENERIC-NEXT: s_or_b32 s15, s15, s43 +; GENERIC-NEXT: s_or_b32 s14, s14, s42 +; GENERIC-NEXT: s_or_b32 s13, s13, s41 +; GENERIC-NEXT: s_or_b32 s12, s12, s40 +; GENERIC-NEXT: s_or_b32 s11, s11, s39 +; GENERIC-NEXT: s_or_b32 s10, s10, s38 +; GENERIC-NEXT: s_or_b32 s8, s8, s36 +; GENERIC-NEXT: s_or_b32 s9, s9, s37 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 +; GENERIC-NEXT: s_cselect_b32 s8, s9, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 2 +; GENERIC-NEXT: s_cselect_b32 s8, s10, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 3 +; GENERIC-NEXT: s_cselect_b32 s8, s11, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 4 +; GENERIC-NEXT: s_cselect_b32 s8, s12, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 5 +; GENERIC-NEXT: s_cselect_b32 s8, s13, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 6 +; GENERIC-NEXT: s_cselect_b32 s8, s14, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 7 +; GENERIC-NEXT: s_cselect_b32 s8, s15, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 8 +; GENERIC-NEXT: s_cselect_b32 s8, s16, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 9 +; GENERIC-NEXT: s_cselect_b32 s8, s17, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 10 +; GENERIC-NEXT: s_cselect_b32 s8, s18, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 11 +; GENERIC-NEXT: s_cselect_b32 s8, s19, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 12 +; GENERIC-NEXT: s_cselect_b32 s7, s7, s8 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 13 +; GENERIC-NEXT: s_cselect_b32 s6, s6, s7 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 14 +; GENERIC-NEXT: s_cselect_b32 s5, s5, s6 +; GENERIC-NEXT: s_cmp_eq_u32 s2, 15 +; GENERIC-NEXT: s_cselect_b32 s4, s4, s5 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, s4 ; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1169,10 +1170,10 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; ; NOOPT-LABEL: extract_neg_offset_sgpr_loaded: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19 -; NOOPT-NEXT: s_load_dwordx16 s[52:67], s[2:3], 0x29 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x39 +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x19 +; NOOPT-NEXT: s_load_dwordx16 s[52:67], s[4:5], 0x29 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0x39 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1269,89 +1270,89 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; ; SI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x39 +; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[4:5], 0x39 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_or_b32 s4, s4, s36 -; SI-MOVREL-NEXT: s_or_b32 s19, s19, s51 -; SI-MOVREL-NEXT: s_or_b32 s18, s18, s50 -; SI-MOVREL-NEXT: s_or_b32 s17, s17, s49 -; SI-MOVREL-NEXT: s_or_b32 s16, s16, s48 -; SI-MOVREL-NEXT: s_or_b32 s15, s15, s47 -; SI-MOVREL-NEXT: s_or_b32 s14, s14, s46 -; SI-MOVREL-NEXT: s_or_b32 s13, s13, s45 -; SI-MOVREL-NEXT: s_or_b32 s12, s12, s44 -; SI-MOVREL-NEXT: s_or_b32 s11, s11, s43 -; SI-MOVREL-NEXT: s_or_b32 s10, s10, s42 -; SI-MOVREL-NEXT: s_or_b32 s9, s9, s41 -; SI-MOVREL-NEXT: s_or_b32 s8, s8, s40 -; SI-MOVREL-NEXT: s_or_b32 s7, s7, s39 -; SI-MOVREL-NEXT: s_or_b32 s6, s6, s38 -; SI-MOVREL-NEXT: s_or_b32 s5, s5, s37 -; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; SI-MOVREL-NEXT: s_add_i32 m0, s20, 0xfffffe00 -; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; SI-MOVREL-NEXT: s_or_b32 s8, s8, s36 +; SI-MOVREL-NEXT: s_or_b32 s5, s23, s51 +; SI-MOVREL-NEXT: s_or_b32 s6, s22, s50 +; SI-MOVREL-NEXT: s_or_b32 s7, s21, s49 +; SI-MOVREL-NEXT: s_or_b32 s20, s20, s48 +; SI-MOVREL-NEXT: s_or_b32 s19, s19, s47 +; SI-MOVREL-NEXT: s_or_b32 s18, s18, s46 +; SI-MOVREL-NEXT: s_or_b32 s17, s17, s45 +; SI-MOVREL-NEXT: s_or_b32 s16, s16, s44 +; SI-MOVREL-NEXT: s_or_b32 s15, s15, s43 +; SI-MOVREL-NEXT: s_or_b32 s14, s14, s42 +; SI-MOVREL-NEXT: s_or_b32 s13, s13, s41 +; SI-MOVREL-NEXT: s_or_b32 s12, s12, s40 +; SI-MOVREL-NEXT: s_or_b32 s11, s11, s39 +; SI-MOVREL-NEXT: s_or_b32 s10, s10, s38 +; SI-MOVREL-NEXT: s_or_b32 s9, s9, s37 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 +; SI-MOVREL-NEXT: s_add_i32 m0, s4, 0xfffffe00 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s5 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0xe4 +; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0xe4 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_or_b32 s6, s6, s38 -; VI-MOVREL-NEXT: s_or_b32 s5, s5, s37 -; VI-MOVREL-NEXT: s_or_b32 s4, s4, s36 -; VI-MOVREL-NEXT: s_or_b32 s3, s19, s51 -; VI-MOVREL-NEXT: s_or_b32 s18, s18, s50 -; VI-MOVREL-NEXT: s_or_b32 s17, s17, s49 -; VI-MOVREL-NEXT: s_or_b32 s16, s16, s48 -; VI-MOVREL-NEXT: s_or_b32 s15, s15, s47 -; VI-MOVREL-NEXT: s_or_b32 s14, s14, s46 -; VI-MOVREL-NEXT: s_or_b32 s13, s13, s45 -; VI-MOVREL-NEXT: s_or_b32 s12, s12, s44 -; VI-MOVREL-NEXT: s_or_b32 s11, s11, s43 -; VI-MOVREL-NEXT: s_or_b32 s10, s10, s42 -; VI-MOVREL-NEXT: s_or_b32 s9, s9, s41 -; VI-MOVREL-NEXT: s_or_b32 s8, s8, s40 -; VI-MOVREL-NEXT: s_or_b32 s7, s7, s39 -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 +; VI-MOVREL-NEXT: s_or_b32 s10, s10, s38 +; VI-MOVREL-NEXT: s_or_b32 s9, s9, s37 +; VI-MOVREL-NEXT: s_or_b32 s8, s8, s36 +; VI-MOVREL-NEXT: s_or_b32 s3, s23, s51 +; VI-MOVREL-NEXT: s_or_b32 s4, s22, s50 +; VI-MOVREL-NEXT: s_or_b32 s5, s21, s49 +; VI-MOVREL-NEXT: s_or_b32 s6, s20, s48 +; VI-MOVREL-NEXT: s_or_b32 s7, s19, s47 +; VI-MOVREL-NEXT: s_or_b32 s18, s18, s46 +; VI-MOVREL-NEXT: s_or_b32 s17, s17, s45 +; VI-MOVREL-NEXT: s_or_b32 s16, s16, s44 +; VI-MOVREL-NEXT: s_or_b32 s15, s15, s43 +; VI-MOVREL-NEXT: s_or_b32 s14, s14, s42 +; VI-MOVREL-NEXT: s_or_b32 s13, s13, s41 +; VI-MOVREL-NEXT: s_or_b32 s12, s12, s40 +; VI-MOVREL-NEXT: s_or_b32 s11, s11, s39 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 -; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s4 ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 @@ -1361,43 +1362,43 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; ; VI-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0xe4 +; VI-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0xe4 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_or_b32 s6, s6, s38 -; VI-IDXMODE-NEXT: s_or_b32 s5, s5, s37 -; VI-IDXMODE-NEXT: s_or_b32 s4, s4, s36 -; VI-IDXMODE-NEXT: s_or_b32 s3, s19, s51 -; VI-IDXMODE-NEXT: s_or_b32 s18, s18, s50 -; VI-IDXMODE-NEXT: s_or_b32 s17, s17, s49 -; VI-IDXMODE-NEXT: s_or_b32 s16, s16, s48 -; VI-IDXMODE-NEXT: s_or_b32 s15, s15, s47 -; VI-IDXMODE-NEXT: s_or_b32 s14, s14, s46 -; VI-IDXMODE-NEXT: s_or_b32 s13, s13, s45 -; VI-IDXMODE-NEXT: s_or_b32 s12, s12, s44 -; VI-IDXMODE-NEXT: s_or_b32 s11, s11, s43 -; VI-IDXMODE-NEXT: s_or_b32 s10, s10, s42 -; VI-IDXMODE-NEXT: s_or_b32 s9, s9, s41 -; VI-IDXMODE-NEXT: s_or_b32 s8, s8, s40 -; VI-IDXMODE-NEXT: s_or_b32 s7, s7, s39 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 +; VI-IDXMODE-NEXT: s_or_b32 s10, s10, s38 +; VI-IDXMODE-NEXT: s_or_b32 s9, s9, s37 +; VI-IDXMODE-NEXT: s_or_b32 s8, s8, s36 +; VI-IDXMODE-NEXT: s_or_b32 s3, s23, s51 +; VI-IDXMODE-NEXT: s_or_b32 s4, s22, s50 +; VI-IDXMODE-NEXT: s_or_b32 s5, s21, s49 +; VI-IDXMODE-NEXT: s_or_b32 s6, s20, s48 +; VI-IDXMODE-NEXT: s_or_b32 s7, s19, s47 +; VI-IDXMODE-NEXT: s_or_b32 s18, s18, s46 +; VI-IDXMODE-NEXT: s_or_b32 s17, s17, s45 +; VI-IDXMODE-NEXT: s_or_b32 s16, s16, s44 +; VI-IDXMODE-NEXT: s_or_b32 s15, s15, s43 +; VI-IDXMODE-NEXT: s_or_b32 s14, s14, s42 +; VI-IDXMODE-NEXT: s_or_b32 s13, s13, s41 +; VI-IDXMODE-NEXT: s_or_b32 s12, s12, s40 +; VI-IDXMODE-NEXT: s_or_b32 s11, s11, s39 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 ; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s7 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s6 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s5 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s4 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s3 ; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0 @@ -1409,46 +1410,46 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; ; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xe4 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0xe4 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, s36 -; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, s51 -; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, s50 -; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, s49 -; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, s48 -; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, s47 -; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, s46 -; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, s45 -; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, s44 -; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, s43 -; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, s42 -; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, s41 -; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, s40 -; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, s39 -; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, s38 -; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, s37 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, s36 +; GFX9-IDXMODE-NEXT: s_or_b32 s3, s23, s51 +; GFX9-IDXMODE-NEXT: s_or_b32 s4, s22, s50 +; GFX9-IDXMODE-NEXT: s_or_b32 s5, s21, s49 +; GFX9-IDXMODE-NEXT: s_or_b32 s6, s20, s48 +; GFX9-IDXMODE-NEXT: s_or_b32 s7, s19, s47 +; GFX9-IDXMODE-NEXT: s_or_b32 s18, s18, s46 +; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, s45 +; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, s44 +; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, s43 +; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, s42 +; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, s41 +; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, s40 +; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, s39 +; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, s38 +; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, s37 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s6 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s3 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] @@ -1464,7 +1465,7 @@ entry: define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; GENERIC-LABEL: extract_neg_offset_vgpr: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_add_i32_e32 v0, vcc, 0xfffffe00, v0 @@ -1508,10 +1509,10 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s22, -1 ; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_add_u32 s20, s20, s11 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1688,7 +1689,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc ; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc ; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc @@ -1726,7 +1727,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc @@ -1768,7 +1769,7 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 11, v2, vcc ; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 12, v2, vcc ; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 13, v2, vcc @@ -1794,7 +1795,7 @@ entry: define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GENERIC-LABEL: extract_undef_offset_sgpr: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GENERIC-NEXT: s_mov_b32 s7, 0xf000 ; GENERIC-NEXT: s_mov_b32 s6, -1 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1807,7 @@ define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr ; ; NOOPT-LABEL: extract_undef_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1822,7 +1823,7 @@ define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr ; ; SI-MOVREL-LABEL: extract_undef_offset_sgpr: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s6, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) @@ -1834,7 +1835,7 @@ define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr ; ; VI-LABEL: extract_undef_offset_sgpr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1844,10 +1845,10 @@ define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr ; ; GFX9-IDXMODE-LABEL: extract_undef_offset_sgpr: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: s_endpgm entry: @@ -1888,8 +1889,8 @@ entry: define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; GENERIC-LABEL: insert_w_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 @@ -1966,8 +1967,8 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; NOOPT-LABEL: insert_w_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2073,13 +2074,13 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; SI-MOVREL-LABEL: insert_w_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2094,7 +2095,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 m0, s6 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 @@ -2106,13 +2107,14 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-MOVREL-LABEL: insert_w_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2128,7 +2130,6 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; VI-MOVREL-NEXT: s_mov_b32 m0, s4 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 @@ -2154,14 +2155,13 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-IDXMODE-LABEL: insert_w_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1 -; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2176,10 +2176,11 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000 -; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 @@ -2203,13 +2204,13 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-IDXMODE-LABEL: insert_w_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1 +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2225,7 +2226,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 @@ -2243,8 +2244,8 @@ entry: define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) { ; GENERIC-LABEL: insert_unsigned_base_plus_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 @@ -2322,8 +2323,8 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; ; NOOPT-LABEL: insert_unsigned_base_plus_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2429,13 +2430,13 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; ; SI-MOVREL-LABEL: insert_unsigned_base_plus_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_and_b32 s4, s4, 0xffff +; SI-MOVREL-NEXT: s_and_b32 s4, s6, 0xffff ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2462,13 +2463,13 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; ; VI-MOVREL-LABEL: insert_unsigned_base_plus_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_and_b32 s2, s4, 0xffff +; VI-MOVREL-NEXT: s_and_b32 s2, s2, 0xffff ; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 @@ -2510,13 +2511,13 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; ; VI-IDXMODE-LABEL: insert_unsigned_base_plus_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff +; VI-IDXMODE-NEXT: s_and_b32 s2, s2, 0xffff ; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2559,13 +2560,13 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; ; GFX9-IDXMODE-LABEL: insert_unsigned_base_plus_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-IDXMODE-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2600,8 +2601,8 @@ entry: define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) { ; GENERIC-LABEL: insert_signed_base_plus_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 @@ -2679,8 +2680,8 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; ; NOOPT-LABEL: insert_signed_base_plus_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2787,13 +2788,13 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; ; SI-MOVREL-LABEL: insert_signed_base_plus_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s4 +; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s6 ; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2821,13 +2822,13 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; ; VI-MOVREL-LABEL: insert_signed_base_plus_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s4 +; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s2 ; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 ; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 @@ -2870,13 +2871,13 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; ; VI-IDXMODE-LABEL: insert_signed_base_plus_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_sext_i32_i16 s2, s4 +; VI-IDXMODE-NEXT: s_sext_i32_i16 s2, s2 ; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2920,13 +2921,13 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; ; GFX9-IDXMODE-LABEL: insert_signed_base_plus_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_sext_i32_i16 s2, s4 +; GFX9-IDXMODE-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 @@ -2964,8 +2965,8 @@ entry: define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; GENERIC-LABEL: insert_wo_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000 @@ -3041,8 +3042,8 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; NOOPT-LABEL: insert_wo_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -3146,8 +3147,8 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; SI-MOVREL-LABEL: insert_wo_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 @@ -3178,8 +3179,8 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-MOVREL-LABEL: insert_wo_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 @@ -3225,8 +3226,8 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; VI-IDXMODE-LABEL: insert_wo_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 @@ -3273,8 +3274,8 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-IDXMODE-LABEL: insert_wo_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 @@ -3294,7 +3295,7 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 @@ -3311,12 +3312,12 @@ entry: define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) { ; GENERIC-LABEL: insert_neg_offset_sgpr: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xd -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GENERIC-NEXT: s_load_dword s6, s[4:5], 0xd +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_add_i32 s6, s4, 0xfffffe00 +; GENERIC-NEXT: s_addk_i32 s6, 0xfe00 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 0 ; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GENERIC-NEXT: s_cmp_eq_u32 s6, 3 @@ -3376,8 +3377,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; ; NOOPT-LABEL: insert_neg_offset_sgpr: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -3479,8 +3480,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; ; SI-MOVREL-LABEL: insert_neg_offset_sgpr: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xd +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; SI-MOVREL-NEXT: s_load_dword s4, s[4:5], 0xd ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 @@ -3510,8 +3511,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; ; VI-MOVREL-LABEL: insert_neg_offset_sgpr: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x34 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2 @@ -3556,8 +3557,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; ; VI-IDXMODE-LABEL: insert_neg_offset_sgpr: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x34 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 @@ -3604,8 +3605,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; ; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 @@ -3624,7 +3625,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00 +; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12 @@ -3640,7 +3641,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:48 @@ -3660,9 +3661,9 @@ entry: define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) { ; GENERIC-LABEL: insert_neg_offset_sgpr_loadreg: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xb -; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x29 -; GENERIC-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19 +; GENERIC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb +; GENERIC-NEXT: s_load_dword s20, s[4:5], 0x29 +; GENERIC-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x19 ; GENERIC-NEXT: s_mov_b32 s19, 0xf000 ; GENERIC-NEXT: s_mov_b32 s18, -1 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) @@ -3725,9 +3726,9 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; ; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29 +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0x29 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -3797,62 +3798,62 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; ; SI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29 -; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0xb -; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 -; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0x29 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 -; SI-MOVREL-NEXT: s_add_i32 m0, s0, 0xfffffe00 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s22 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s23 +; SI-MOVREL-NEXT: s_add_i32 m0, s6, 0xfffffe00 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4 -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; VI-MOVREL-NEXT: s_add_i32 m0, s20, 0xfffffe00 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 -; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s22 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s23 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 @@ -3876,35 +3877,35 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; ; VI-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; VI-IDXMODE-NEXT: s_add_i32 s3, s2, 0xfffffe00 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s20 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s21 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s22 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s23 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 +; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 -; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 @@ -3925,29 +3926,29 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; ; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 -; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s20 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s21 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 @@ -3965,7 +3966,7 @@ entry: define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GENERIC-LABEL: insert_neg_offset_vgpr: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_add_i32_e32 v12, vcc, 0xfffffe00, v0 @@ -4014,10 +4015,10 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s22, -1 ; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_add_u32 s20, s20, s11 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -4320,7 +4321,7 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 ; SI-MOVREL-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-MOVREL-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 ; SI-MOVREL-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc @@ -4355,7 +4356,7 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 ; VI-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 ; VI-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc @@ -4425,7 +4426,7 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc @@ -4453,7 +4454,7 @@ entry: define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GENERIC-LABEL: insert_neg_inline_offset_vgpr: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_add_i32_e32 v12, vcc, -16, v0 @@ -4503,10 +4504,10 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s22, -1 ; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s20, s20, s9 +; NOOPT-NEXT: s_add_u32 s20, s20, s11 ; NOOPT-NEXT: s_addc_u32 s21, s21, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s6, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -4810,7 +4811,7 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 10, v16, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 9, v16, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc @@ -4850,7 +4851,7 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 ; VI-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc @@ -4917,7 +4918,7 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 10, v17, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 9, v17, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 16, v17, vcc @@ -4947,17 +4948,17 @@ entry: define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) { ; GENERIC-LABEL: extract_vgpr_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GENERIC-NEXT: s_mov_b32 s11, 0xf000 -; GENERIC-NEXT: s_mov_b32 s6, 0 +; GENERIC-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s10, 0 ; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 -; GENERIC-NEXT: s_mov_b32 s7, s11 +; GENERIC-NEXT: s_mov_b32 s11, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc +; GENERIC-NEXT: buffer_load_dword v1, v[1:2], s[8:11], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GENERIC-NEXT: s_mov_b32 s10, -1 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: ;;#ASMSTART ; GENERIC-NEXT: s_mov_b32 s4, 17 ; GENERIC-NEXT: ;;#ASMEND @@ -5023,16 +5024,16 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v2 ; GENERIC-NEXT: v_cndmask_b32_e32 v2, 16, v3, vcc ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; GENERIC-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; GENERIC-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GENERIC-NEXT: s_cbranch_execz .LBB16_2 ; GENERIC-NEXT: ; %bb.1: ; %bb1 ; GENERIC-NEXT: v_mov_b32_e32 v0, s4 -; GENERIC-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: .LBB16_2: ; %bb2 ; GENERIC-NEXT: s_endpgm @@ -5043,12 +5044,11 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s38, -1 ; NOOPT-NEXT: s_mov_b32 s39, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s36, s36, s9 +; NOOPT-NEXT: s_add_u32 s36, s36, s11 ; NOOPT-NEXT: s_addc_u32 s37, s37, 0 -; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] ; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill -; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s8, s3 ; NOOPT-NEXT: s_mov_b32 s4, s2 @@ -5398,15 +5398,15 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; ; SI-MOVREL-LABEL: extract_vgpr_offset_multiple_in_block: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-MOVREL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-MOVREL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 ; SI-MOVREL-NEXT: s_mov_b32 s11, 0xf000 -; SI-MOVREL-NEXT: s_mov_b32 s6, 0 -; SI-MOVREL-NEXT: s_mov_b32 s7, s11 +; SI-MOVREL-NEXT: s_mov_b32 s2, 0 +; SI-MOVREL-NEXT: s_mov_b32 s3, s11 ; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc +; SI-MOVREL-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-MOVREL-NEXT: s_mov_b32 s10, -1 @@ -5490,7 +5490,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; ; VI-LABEL: extract_vgpr_offset_multiple_in_block: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s1 @@ -5498,7 +5498,7 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: s_mov_b32 s4, 17 @@ -5582,8 +5582,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; ; GFX9-IDXMODE-LABEL: extract_vgpr_offset_multiple_in_block: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) @@ -5654,9 +5654,9 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 15, v4, s[0:1] ; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v0, 16, v3, s[0:1] -; GFX9-IDXMODE-NEXT: global_store_dword v1, v2, s[6:7] +; GFX9-IDXMODE-NEXT: global_store_dword v1, v2, s[2:3] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-IDXMODE-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB16_2 @@ -5691,37 +5691,37 @@ bb2: define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) { ; GENERIC-LABEL: insert_vgpr_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0xd -; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GENERIC-NEXT: s_mov_b32 s23, 0xf000 +; GENERIC-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0xd +; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s26, 0 ; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 -; GENERIC-NEXT: s_mov_b32 s27, s23 +; GENERIC-NEXT: s_mov_b32 s27, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GENERIC-NEXT: s_mov_b32 s22, -1 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: ;;#ASMSTART ; GENERIC-NEXT: v_mov_b32 v1, 62 ; GENERIC-NEXT: ;;#ASMEND -; GENERIC-NEXT: v_mov_b32_e32 v3, s16 -; GENERIC-NEXT: v_mov_b32_e32 v4, s17 -; GENERIC-NEXT: v_mov_b32_e32 v5, s18 -; GENERIC-NEXT: v_mov_b32_e32 v6, s19 -; GENERIC-NEXT: v_mov_b32_e32 v7, s12 -; GENERIC-NEXT: v_mov_b32_e32 v8, s13 -; GENERIC-NEXT: v_mov_b32_e32 v9, s14 -; GENERIC-NEXT: v_mov_b32_e32 v10, s15 -; GENERIC-NEXT: v_mov_b32_e32 v11, s8 -; GENERIC-NEXT: v_mov_b32_e32 v12, s9 -; GENERIC-NEXT: v_mov_b32_e32 v13, s10 -; GENERIC-NEXT: v_mov_b32_e32 v14, s11 -; GENERIC-NEXT: v_mov_b32_e32 v15, s4 -; GENERIC-NEXT: v_mov_b32_e32 v16, s5 -; GENERIC-NEXT: v_mov_b32_e32 v17, s6 -; GENERIC-NEXT: v_mov_b32_e32 v18, s7 +; GENERIC-NEXT: v_mov_b32_e32 v3, s20 +; GENERIC-NEXT: v_mov_b32_e32 v4, s21 +; GENERIC-NEXT: v_mov_b32_e32 v5, s22 +; GENERIC-NEXT: v_mov_b32_e32 v6, s23 +; GENERIC-NEXT: v_mov_b32_e32 v7, s16 +; GENERIC-NEXT: v_mov_b32_e32 v8, s17 +; GENERIC-NEXT: v_mov_b32_e32 v9, s18 +; GENERIC-NEXT: v_mov_b32_e32 v10, s19 +; GENERIC-NEXT: v_mov_b32_e32 v11, s12 +; GENERIC-NEXT: v_mov_b32_e32 v12, s13 +; GENERIC-NEXT: v_mov_b32_e32 v13, s14 +; GENERIC-NEXT: v_mov_b32_e32 v14, s15 +; GENERIC-NEXT: v_mov_b32_e32 v15, s8 +; GENERIC-NEXT: v_mov_b32_e32 v16, s9 +; GENERIC-NEXT: v_mov_b32_e32 v17, s10 +; GENERIC-NEXT: v_mov_b32_e32 v18, s11 ; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2 ; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 ; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc @@ -5788,19 +5788,19 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 ; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 +; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0 +; GENERIC-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GENERIC-NEXT: s_cbranch_execz .LBB17_2 ; GENERIC-NEXT: ; %bb.1: ; %bb1 -; GENERIC-NEXT: buffer_store_dword v1, off, s[20:23], 0 +; GENERIC-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: .LBB17_2: ; %bb2 ; GENERIC-NEXT: s_endpgm @@ -5811,12 +5811,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_mov_b32 s29, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s30, -1 ; NOOPT-NEXT: s_mov_b32 s31, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s28, s28, s9 +; NOOPT-NEXT: s_add_u32 s28, s28, s11 ; NOOPT-NEXT: s_addc_u32 s29, s29, 0 ; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Spill -; NOOPT-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xd -; NOOPT-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19 +; NOOPT-NEXT: s_load_dwordx2 s[18:19], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xd +; NOOPT-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x19 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s24, s19 ; NOOPT-NEXT: s_mov_b32 s20, s18 @@ -6250,38 +6250,38 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; ; SI-MOVREL-LABEL: insert_vgpr_offset_multiple_in_block: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 -; SI-MOVREL-NEXT: s_mov_b32 s6, 0 -; SI-MOVREL-NEXT: s_mov_b32 s7, s23 +; SI-MOVREL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s10, 0 +; SI-MOVREL-NEXT: s_mov_b32 s11, s3 ; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[4:7], 0 addr64 glc +; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: ;;#ASMSTART ; SI-MOVREL-NEXT: v_mov_b32 v1, 62 ; SI-MOVREL-NEXT: ;;#ASMEND -; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s16 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s17 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s19 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s11 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4 -; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s6 -; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s21 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s22 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s23 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s11 ; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2 ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc @@ -6348,25 +6348,25 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-MOVREL-NEXT: s_cbranch_execz .LBB17_2 ; SI-MOVREL-NEXT: ; %bb.1: ; %bb1 -; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[20:23], 0 +; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: .LBB17_2: ; %bb2 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-LABEL: insert_vgpr_offset_multiple_in_block: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s1 @@ -6374,28 +6374,28 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: v_mov_b32 v1, 62 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s16 -; VI-NEXT: v_mov_b32_e32 v4, s17 -; VI-NEXT: v_mov_b32_e32 v5, s18 -; VI-NEXT: v_mov_b32_e32 v6, s19 -; VI-NEXT: v_mov_b32_e32 v7, s12 -; VI-NEXT: v_mov_b32_e32 v8, s13 -; VI-NEXT: v_mov_b32_e32 v9, s14 -; VI-NEXT: v_mov_b32_e32 v10, s15 -; VI-NEXT: v_mov_b32_e32 v11, s8 -; VI-NEXT: v_mov_b32_e32 v12, s9 -; VI-NEXT: v_mov_b32_e32 v13, s10 -; VI-NEXT: v_mov_b32_e32 v14, s11 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v16, s5 -; VI-NEXT: v_mov_b32_e32 v17, s6 -; VI-NEXT: v_mov_b32_e32 v18, s7 +; VI-NEXT: v_mov_b32_e32 v3, s20 +; VI-NEXT: v_mov_b32_e32 v4, s21 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s23 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s11 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2 @@ -6494,34 +6494,34 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; ; GFX9-IDXMODE-LABEL: insert_vgpr_offset_multiple_in_block: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-IDXMODE-NEXT: global_load_dword v3, v1, s[0:1] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: ;;#ASMSTART ; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62 ; GFX9-IDXMODE-NEXT: ;;#ASMEND ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s19 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s11 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s6 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s7 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s11 ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3 ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v3 @@ -6629,8 +6629,8 @@ bb2: define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { ; GENERIC-LABEL: insert_w_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000 @@ -6759,8 +6759,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; ; NOOPT-LABEL: insert_w_offset_multiple_in_block: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xb ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -6926,13 +6926,13 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; ; SI-MOVREL-LABEL: insert_w_offset_multiple_in_block: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s2, s4, 1 +; SI-MOVREL-NEXT: s_add_i32 s3, s2, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -6947,9 +6947,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s2 +; SI-MOVREL-NEXT: s_mov_b32 m0, s3 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 -; SI-MOVREL-NEXT: s_add_i32 s4, s4, 2 +; SI-MOVREL-NEXT: s_add_i32 s2, s2, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 @@ -6966,7 +6966,7 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 m0, s2 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 @@ -6982,13 +6982,13 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; ; VI-MOVREL-LABEL: insert_w_offset_multiple_in_block: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s2, s4, 1 +; VI-MOVREL-NEXT: s_add_i32 s3, s2, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -7003,10 +7003,11 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 -; VI-MOVREL-NEXT: s_add_i32 s4, s4, 2 +; VI-MOVREL-NEXT: s_mov_b32 m0, s3 +; VI-MOVREL-NEXT: s_add_i32 s2, s2, 2 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 ; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 @@ -7023,7 +7024,6 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 -; VI-MOVREL-NEXT: s_mov_b32 m0, s4 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 ; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 @@ -7068,14 +7068,14 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; ; VI-IDXMODE-LABEL: insert_w_offset_multiple_in_block: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_add_i32 s2, s4, 1 +; VI-IDXMODE-NEXT: s_add_i32 s3, s2, 1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 @@ -7088,13 +7088,12 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 -; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 2 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 -; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 2 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -7110,10 +7109,11 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 -; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off +; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 @@ -7156,14 +7156,14 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; ; GFX9-IDXMODE-LABEL: insert_w_offset_multiple_in_block: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_add_i32 s2, s4, 1 +; GFX9-IDXMODE-NEXT: s_add_i32 s3, s2, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000 @@ -7177,11 +7177,11 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 2 +; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -7197,7 +7197,7 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v32 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0 @@ -7227,7 +7227,7 @@ entry: define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; GENERIC-LABEL: extract_adjacent_blocks: ; GENERIC: ; %bb.0: ; %bb -; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s0, s[4:5], 0x9 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: s_cmp_lg_u32 s0, 0 ; GENERIC-NEXT: s_cbranch_scc0 .LBB19_4 @@ -7265,9 +7265,9 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; NOOPT-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s14, -1 ; NOOPT-NEXT: s_mov_b32 s15, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s12, s12, s9 +; NOOPT-NEXT: s_add_u32 s12, s12, s11 ; NOOPT-NEXT: s_addc_u32 s13, s13, 0 -; NOOPT-NEXT: s_load_dword s2, s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s2, s[4:5], 0x9 ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 ; NOOPT-NEXT: ; implicit-def: $sgpr3 ; NOOPT-NEXT: s_mov_b32 s3, 0 @@ -7363,7 +7363,7 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; ; SI-MOVREL-LABEL: extract_adjacent_blocks: ; SI-MOVREL: ; %bb.0: ; %bb -; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0 ; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB19_4 @@ -7395,7 +7395,7 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; ; VI-LABEL: extract_adjacent_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB19_4 @@ -7421,7 +7421,7 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) { ; ; GFX9-IDXMODE-LABEL: extract_adjacent_blocks: ; GFX9-IDXMODE: ; %bb.0: ; %bb -; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB19_4 @@ -7469,7 +7469,7 @@ bb7: define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; GENERIC-LABEL: insert_adjacent_blocks: ; GENERIC: ; %bb.0: ; %bb -; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s0, s[4:5], 0x9 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: s_cmp_lg_u32 s0, 0 ; GENERIC-NEXT: s_cbranch_scc0 .LBB20_4 @@ -7507,11 +7507,10 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; NOOPT-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s18, -1 ; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s16, s16, s9 +; NOOPT-NEXT: s_add_u32 s16, s16, s11 ; NOOPT-NEXT: s_addc_u32 s17, s17, 0 -; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3] -; NOOPT-NEXT: s_load_dword s2, s[0:1], 0x9 -; NOOPT-NEXT: s_load_dword s0, s[0:1], 0xa +; NOOPT-NEXT: s_load_dword s2, s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xa ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b64 s[0:1], -1 ; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 @@ -7612,7 +7611,7 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; ; SI-MOVREL-LABEL: insert_adjacent_blocks: ; SI-MOVREL: ; %bb.0: ; %bb -; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0 ; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB20_4 @@ -7644,7 +7643,7 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; ; VI-LABEL: insert_adjacent_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB20_4 @@ -7670,7 +7669,7 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) { ; ; GFX9-IDXMODE-LABEL: insert_adjacent_blocks: ; GFX9-IDXMODE: ; %bb.0: ; %bb -; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB20_4 @@ -7719,7 +7718,7 @@ bb7: define amdgpu_kernel void @multi_same_block(i32 %arg) { ; GENERIC-LABEL: multi_same_block: ; GENERIC: ; %bb.0: ; %bb -; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9 +; GENERIC-NEXT: s_load_dword s0, s[4:5], 0x9 ; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41900000 ; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41b0cccd ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) @@ -7737,7 +7736,7 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; ; NOOPT-LABEL: multi_same_block: ; NOOPT: ; %bb.0: ; %bb -; NOOPT-NEXT: s_load_dword s0, s[2:3], 0x9 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0x9 ; NOOPT-NEXT: s_mov_b32 s8, 0x41900000 ; NOOPT-NEXT: ; implicit-def: $sgpr9 ; NOOPT-NEXT: ; implicit-def: $sgpr1 @@ -7829,7 +7828,7 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; ; SI-MOVREL-LABEL: multi_same_block: ; SI-MOVREL: ; %bb.0: ; %bb -; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) @@ -7843,7 +7842,7 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; ; VI-MOVREL-LABEL: multi_same_block: ; VI-MOVREL: ; %bb.0: ; %bb -; VI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) @@ -7857,7 +7856,7 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; ; VI-IDXMODE-LABEL: multi_same_block: ; VI-IDXMODE: ; %bb.0: ; %bb -; VI-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) @@ -7873,7 +7872,7 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) { ; ; GFX9-IDXMODE-LABEL: multi_same_block: ; GFX9-IDXMODE: ; %bb.0: ; %bb -; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) @@ -7903,24 +7902,24 @@ bb: define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; GENERIC-LABEL: extract_largest_inbounds_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GENERIC-NEXT: s_mov_b32 s7, 0xf000 -; GENERIC-NEXT: s_mov_b32 s6, -1 -; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd -; GENERIC-NEXT: s_mov_b32 s2, s6 -; GENERIC-NEXT: s_mov_b32 s3, s7 +; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_load_dword s12, s[4:5], 0xd +; GENERIC-NEXT: s_mov_b32 s6, s2 +; GENERIC-NEXT: s_mov_b32 s7, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_mov_b32 s4, s8 -; GENERIC-NEXT: s_mov_b32 s5, s9 -; GENERIC-NEXT: s_mov_b32 s0, s10 -; GENERIC-NEXT: s_mov_b32 s1, s11 -; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_mov_b32 s0, s8 +; GENERIC-NEXT: s_mov_b32 s1, s9 +; GENERIC-NEXT: s_mov_b32 s4, s10 +; GENERIC-NEXT: s_mov_b32 s5, s11 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: s_add_i32 s12, s12, 15 ; GENERIC-NEXT: s_cmp_eq_u32 s12, 1 @@ -7968,14 +7967,14 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc -; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: extract_largest_inbounds_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -8040,97 +8039,97 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; ; SI-MOVREL-LABEL: extract_largest_inbounds_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 -; SI-MOVREL-NEXT: s_mov_b32 s6, -1 -; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_mov_b32 s6, s2 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_mov_b32 s0, s10 -; SI-MOVREL-NEXT: s_mov_b32 s1, s11 -; SI-MOVREL-NEXT: s_mov_b32 s3, s7 -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_mov_b32 s4, s10 +; SI-MOVREL-NEXT: s_mov_b32 s5, s11 +; SI-MOVREL-NEXT: s_mov_b32 s7, s3 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15 ; SI-MOVREL-NEXT: s_mov_b32 m0, s12 -; SI-MOVREL-NEXT: s_mov_b32 s4, s8 -; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: s_mov_b32 s0, s8 +; SI-MOVREL-NEXT: s_mov_b32 s1, s9 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 -; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-MOVREL-LABEL: extract_largest_inbounds_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 +; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 15 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 +; VI-MOVREL-NEXT: s_add_i32 s6, s6, 15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 +; VI-MOVREL-NEXT: s_mov_b32 m0, s6 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 ; VI-MOVREL-NEXT: s_endpgm ; ; VI-IDXMODE-LABEL: extract_largest_inbounds_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 +; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s2 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 -; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 15 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1 +; VI-IDXMODE-NEXT: s_add_i32 s6, s6, 15 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 @@ -8138,23 +8137,23 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; ; GFX9-IDXMODE-LABEL: extract_largest_inbounds_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:48 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:32 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:16 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 15 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 15 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off -; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] ; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in @@ -8167,24 +8166,24 @@ entry: define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; GENERIC-LABEL: extract_out_of_bounds_offset: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GENERIC-NEXT: s_mov_b32 s7, 0xf000 -; GENERIC-NEXT: s_mov_b32 s6, -1 -; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd -; GENERIC-NEXT: s_mov_b32 s2, s6 -; GENERIC-NEXT: s_mov_b32 s3, s7 +; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_load_dword s12, s[4:5], 0xd +; GENERIC-NEXT: s_mov_b32 s6, s2 +; GENERIC-NEXT: s_mov_b32 s7, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_mov_b32 s4, s8 -; GENERIC-NEXT: s_mov_b32 s5, s9 -; GENERIC-NEXT: s_mov_b32 s0, s10 -; GENERIC-NEXT: s_mov_b32 s1, s11 -; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_mov_b32 s0, s8 +; GENERIC-NEXT: s_mov_b32 s1, s9 +; GENERIC-NEXT: s_mov_b32 s4, s10 +; GENERIC-NEXT: s_mov_b32 s5, s11 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) ; GENERIC-NEXT: s_add_i32 s12, s12, 16 ; GENERIC-NEXT: s_cmp_eq_u32 s12, 1 @@ -8232,14 +8231,14 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc -; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: extract_out_of_bounds_offset: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -8304,97 +8303,97 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; ; SI-MOVREL-LABEL: extract_out_of_bounds_offset: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 -; SI-MOVREL-NEXT: s_mov_b32 s6, -1 -; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_mov_b32 s6, s2 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_mov_b32 s0, s10 -; SI-MOVREL-NEXT: s_mov_b32 s1, s11 -; SI-MOVREL-NEXT: s_mov_b32 s3, s7 -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_mov_b32 s4, s10 +; SI-MOVREL-NEXT: s_mov_b32 s5, s11 +; SI-MOVREL-NEXT: s_mov_b32 s7, s3 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16 ; SI-MOVREL-NEXT: s_mov_b32 m0, s12 -; SI-MOVREL-NEXT: s_mov_b32 s4, s8 -; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: s_mov_b32 s0, s8 +; SI-MOVREL-NEXT: s_mov_b32 s1, s9 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 -; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-MOVREL-LABEL: extract_out_of_bounds_offset: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 +; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 +; VI-MOVREL-NEXT: s_add_i32 s6, s6, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 +; VI-MOVREL-NEXT: s_mov_b32 m0, s6 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 ; VI-MOVREL-NEXT: s_endpgm ; ; VI-IDXMODE-LABEL: extract_out_of_bounds_offset: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 +; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s2 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 -; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 16 -; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1 +; VI-IDXMODE-NEXT: s_add_i32 s6, s6, 16 +; VI-IDXMODE-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0 @@ -8402,23 +8401,23 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; ; GFX9-IDXMODE-LABEL: extract_out_of_bounds_offset: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:48 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:32 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:16 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 16 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 16 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off -; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] ; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in @@ -8431,80 +8430,80 @@ entry: define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) { ; GENERIC-LABEL: extractelement_v16i32_or_index: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GENERIC-NEXT: s_mov_b32 s7, 0xf000 -; GENERIC-NEXT: s_mov_b32 s6, -1 -; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd -; GENERIC-NEXT: s_mov_b32 s2, s6 -; GENERIC-NEXT: s_mov_b32 s3, s7 +; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: s_load_dword s12, s[4:5], 0xd +; GENERIC-NEXT: s_mov_b32 s6, s2 +; GENERIC-NEXT: s_mov_b32 s7, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_mov_b32 s4, s8 -; GENERIC-NEXT: s_mov_b32 s5, s9 -; GENERIC-NEXT: s_mov_b32 s0, s10 -; GENERIC-NEXT: s_mov_b32 s1, s11 -; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; GENERIC-NEXT: s_mov_b32 s0, s8 +; GENERIC-NEXT: s_mov_b32 s1, s9 +; GENERIC-NEXT: s_mov_b32 s4, s10 +; GENERIC-NEXT: s_mov_b32 s5, s11 +; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) -; GENERIC-NEXT: s_lshl_b32 s0, s12, 2 -; GENERIC-NEXT: s_or_b32 s0, s0, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 1 +; GENERIC-NEXT: s_lshl_b32 s4, s12, 2 +; GENERIC-NEXT: s_or_b32 s4, s4, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 1 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 2 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 2 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 3 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 3 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 4 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 4 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 5 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 5 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 6 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 6 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 7 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 7 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 8 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 8 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 9 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 9 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 10 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 10 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 11 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 11 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 12 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 12 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 13 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 13 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 14 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 14 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s0, 15 +; GENERIC-NEXT: s_cmp_eq_u32 s4, 15 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc -; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: extractelement_v16i32_or_index: ; NOOPT: ; %bb.0: ; %entry -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0xd ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -8569,96 +8568,96 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; ; SI-MOVREL-LABEL: extractelement_v16i32_or_index: ; SI-MOVREL: ; %bb.0: ; %entry -; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000 -; SI-MOVREL-NEXT: s_mov_b32 s6, -1 -; SI-MOVREL-NEXT: s_mov_b32 s2, s6 +; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-MOVREL-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 +; SI-MOVREL-NEXT: s_mov_b32 s6, s2 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_mov_b32 s0, s10 -; SI-MOVREL-NEXT: s_mov_b32 s1, s11 -; SI-MOVREL-NEXT: s_mov_b32 s3, s7 -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; SI-MOVREL-NEXT: s_mov_b32 s4, s10 +; SI-MOVREL-NEXT: s_mov_b32 s5, s11 +; SI-MOVREL-NEXT: s_mov_b32 s7, s3 +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc +; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2 -; SI-MOVREL-NEXT: s_mov_b32 m0, s0 -; SI-MOVREL-NEXT: s_mov_b32 s4, s8 -; SI-MOVREL-NEXT: s_mov_b32 s5, s9 +; SI-MOVREL-NEXT: s_lshl_b32 s4, s12, 2 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s0, s8 +; SI-MOVREL-NEXT: s_mov_b32 s1, s9 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 -; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-MOVREL-LABEL: extractelement_v16i32_or_index: ; VI-MOVREL: ; %bb.0: ; %entry -; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 +; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16 -; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_lshl_b32 s0, s2, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 +; VI-MOVREL-NEXT: s_lshl_b32 s0, s6, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 ; VI-MOVREL-NEXT: s_mov_b32 m0, s0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 ; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 ; VI-MOVREL-NEXT: s_endpgm ; ; VI-IDXMODE-LABEL: extractelement_v16i32_or_index: ; VI-IDXMODE: ; %bb.0: ; %entry -; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34 +; VI-IDXMODE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 +; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 +; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16 -; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 +; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s2 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5 -; VI-IDXMODE-NEXT: s_lshl_b32 s0, s2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1 +; VI-IDXMODE-NEXT: s_lshl_b32 s0, s6, 2 ; VI-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v1 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off @@ -8667,23 +8666,23 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; ; GFX9-IDXMODE-LABEL: extractelement_v16i32_or_index: ; GFX9-IDXMODE: ; %bb.0: ; %entry -; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-IDXMODE-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:48 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:32 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:16 glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc +; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-IDXMODE-NEXT: s_lshl_b32 s2, s4, 2 +; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off -; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5] +; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1] ; GFX9-IDXMODE-NEXT: s_endpgm entry: %ld = load volatile <16 x i32>, ptr addrspace(1) %in @@ -8697,31 +8696,31 @@ entry: define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind { ; GENERIC-LABEL: insertelement_v16f32_or_index: ; GENERIC: ; %bb.0: -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x29 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0x29 ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_lshl_b32 s20, s20, 2 -; GENERIC-NEXT: v_mov_b32_e32 v0, s7 -; GENERIC-NEXT: v_mov_b32_e32 v1, s6 -; GENERIC-NEXT: v_mov_b32_e32 v4, s5 -; GENERIC-NEXT: v_mov_b32_e32 v5, s4 -; GENERIC-NEXT: v_mov_b32_e32 v6, s11 -; GENERIC-NEXT: v_mov_b32_e32 v8, s10 -; GENERIC-NEXT: v_mov_b32_e32 v9, s9 -; GENERIC-NEXT: v_mov_b32_e32 v11, s8 -; GENERIC-NEXT: v_mov_b32_e32 v12, s15 -; GENERIC-NEXT: v_mov_b32_e32 v13, s14 -; GENERIC-NEXT: v_mov_b32_e32 v14, s13 -; GENERIC-NEXT: v_mov_b32_e32 v15, s12 -; GENERIC-NEXT: v_mov_b32_e32 v16, s19 -; GENERIC-NEXT: v_mov_b32_e32 v17, s18 -; GENERIC-NEXT: v_mov_b32_e32 v18, s17 -; GENERIC-NEXT: v_mov_b32_e32 v19, s16 -; GENERIC-NEXT: s_or_b32 s4, s20, 1 +; GENERIC-NEXT: s_lshl_b32 s4, s4, 2 +; GENERIC-NEXT: v_mov_b32_e32 v0, s11 +; GENERIC-NEXT: v_mov_b32_e32 v1, s10 +; GENERIC-NEXT: v_mov_b32_e32 v4, s9 +; GENERIC-NEXT: v_mov_b32_e32 v5, s8 +; GENERIC-NEXT: v_mov_b32_e32 v6, s15 +; GENERIC-NEXT: v_mov_b32_e32 v8, s14 +; GENERIC-NEXT: v_mov_b32_e32 v9, s13 +; GENERIC-NEXT: v_mov_b32_e32 v11, s12 +; GENERIC-NEXT: v_mov_b32_e32 v12, s19 +; GENERIC-NEXT: v_mov_b32_e32 v13, s18 +; GENERIC-NEXT: v_mov_b32_e32 v14, s17 +; GENERIC-NEXT: v_mov_b32_e32 v15, s16 +; GENERIC-NEXT: v_mov_b32_e32 v16, s23 +; GENERIC-NEXT: v_mov_b32_e32 v17, s22 +; GENERIC-NEXT: v_mov_b32_e32 v18, s21 +; GENERIC-NEXT: v_mov_b32_e32 v19, s20 +; GENERIC-NEXT: s_or_b32 s4, s4, 1 ; GENERIC-NEXT: s_cmp_lg_u32 s4, 3 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc @@ -8780,9 +8779,9 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; ; NOOPT-LABEL: insertelement_v16f32_or_index: ; NOOPT: ; %bb.0: -; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 -; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29 +; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; NOOPT-NEXT: s_load_dword s4, s[4:5], 0x29 ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: s_mov_b32 s7, s1 ; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -8856,64 +8855,64 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; ; SI-MOVREL-LABEL: insertelement_v16f32_or_index: ; SI-MOVREL: ; %bb.0: -; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29 -; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dword s6, s[4:5], 0x29 +; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 -; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000 +; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_lshl_b32 s0, s0, 2 -; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 -; SI-MOVREL-NEXT: s_mov_b32 m0, s0 -; SI-MOVREL-NEXT: s_mov_b32 s22, -1 +; SI-MOVREL-NEXT: s_lshl_b32 s4, s6, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s22 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s23 +; SI-MOVREL-NEXT: s_mov_b32 m0, s4 +; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm ; ; VI-MOVREL-LABEL: insertelement_v16f32_or_index: ; VI-MOVREL: ; %bb.0: -; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4 -; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dword s2, s[4:5], 0xa4 +; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_lshl_b32 s2, s20, 2 -; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 +; VI-MOVREL-NEXT: s_lshl_b32 s2, s2, 2 +; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 ; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6 -; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7 -; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9 -; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10 -; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11 -; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12 -; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13 -; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14 -; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15 -; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17 -; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18 -; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19 +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 +; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 +; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 +; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 +; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 +; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 +; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s22 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s23 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 @@ -8939,28 +8938,28 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; ; VI-IDXMODE-LABEL: insertelement_v16f32_or_index: ; VI-IDXMODE: ; %bb.0: -; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 -; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0xa4 +; VI-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; VI-IDXMODE-NEXT: s_lshl_b32 s3, s20, 2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 +; VI-IDXMODE-NEXT: s_lshl_b32 s3, s2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s20 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s21 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s22 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s23 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v16 @@ -8989,29 +8988,29 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; ; GFX9-IDXMODE-LABEL: insertelement_v16f32_or_index: ; GFX9-IDXMODE: ; %bb.0: -; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4 -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0xa4 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x40a00000 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19 -; GFX9-IDXMODE-NEXT: s_lshl_b32 s2, s20, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s20 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s21 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-IDXMODE-NEXT: s_lshl_b32 s2, s2, 2 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v17 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off @@ -9030,7 +9029,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; GENERIC-LABEL: broken_phi_bb: ; GENERIC: ; %bb.0: ; %bb -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GENERIC-NEXT: s_mov_b32 s6, 8 ; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_mov_b32 s2, -1 @@ -9062,10 +9061,10 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; NOOPT-NEXT: s_mov_b32 s26, -1 ; NOOPT-NEXT: s_mov_b32 s27, 0xe8f000 -; NOOPT-NEXT: s_add_u32 s24, s24, s9 +; NOOPT-NEXT: s_add_u32 s24, s24, s11 ; NOOPT-NEXT: s_addc_u32 s25, s25, 0 -; NOOPT-NEXT: s_load_dword s1, s[2:3], 0x9 -; NOOPT-NEXT: s_load_dword s0, s[2:3], 0xa +; NOOPT-NEXT: s_load_dword s1, s[4:5], 0x9 +; NOOPT-NEXT: s_load_dword s0, s[4:5], 0xa ; NOOPT-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane ; NOOPT-NEXT: s_waitcnt lgkmcnt(0) ; NOOPT-NEXT: v_writelane_b32 v18, s1, 0 @@ -9304,7 +9303,7 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; ; SI-MOVREL-LABEL: broken_phi_bb: ; SI-MOVREL: ; %bb.0: ; %bb -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 8 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 @@ -9342,7 +9341,7 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; ; VI-MOVREL-LABEL: broken_phi_bb: ; VI-MOVREL: ; %bb.0: ; %bb -; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 8 ; VI-MOVREL-NEXT: s_branch .LBB26_2 ; VI-MOVREL-NEXT: .LBB26_1: @@ -9378,7 +9377,7 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; ; VI-IDXMODE-LABEL: broken_phi_bb: ; VI-IDXMODE: ; %bb.0: ; %bb -; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 8 ; VI-IDXMODE-NEXT: s_branch .LBB26_2 ; VI-IDXMODE-NEXT: .LBB26_1: @@ -9415,7 +9414,7 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; ; GFX9-IDXMODE-LABEL: broken_phi_bb: ; GFX9-IDXMODE: ; %bb.0: ; %bb -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-IDXMODE-NEXT: s_branch .LBB26_2 ; GFX9-IDXMODE-NEXT: .LBB26_1: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 44e8ae01fd6921..c664c5ccab4316 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -11,10 +11,10 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) { ; GCN-LABEL: name: extract_w_offset_vgpr ; GCN: bb.0.entry: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1 ; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 ; GCN-NEXT: renamable $sgpr4 = S_MOV_B32 61440 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 44a2c34b06b574..3e6143866bf88a 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -11,68 +11,70 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-LABEL: indirect_call_known_no_special_inputs: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, wobble@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, wobble@gotpcrel32@hi+12 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, snork@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, snork@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x0 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_mov_b64 s[14:15], 0 +; GFX9-NEXT: s_load_dword s17, s[14:15], 0x0 +; GFX9-NEXT: s_getpc_b64 s[14:15] +; GFX9-NEXT: s_add_u32 s14, s14, wobble@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s15, s15, wobble@gotpcrel32@hi+12 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, snork@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, snork@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[22:23], s[14:15], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, 1, s15 -; GFX9-NEXT: s_cmp_eq_u32 s8, 1 +; GFX9-NEXT: s_and_b32 s14, 1, s17 +; GFX9-NEXT: s_cmp_eq_u32 s14, 1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_cselect_b32 s17, s21, s19 -; GFX9-NEXT: s_cselect_b32 s16, s20, s18 +; GFX9-NEXT: s_cselect_b32 s19, s23, s21 +; GFX9-NEXT: s_cselect_b32 s18, s22, s20 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-NEXT: s_mov_b32 s14, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: indirect_call_known_no_special_inputs: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX12-NEXT: s_getpc_b64 s[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_sext_i32_i16 s7, s7 ; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+12 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+24 -; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX12-NEXT: s_mov_b64 s[4:5], 0 -; GFX12-NEXT: s_getpc_b64 s[8:9] +; GFX12-NEXT: s_getpc_b64 s[12:13] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_sext_i32_i16 s9, s9 -; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+12 +; GFX12-NEXT: s_sext_i32_i16 s13, s13 +; GFX12-NEXT: s_add_co_u32 s12, s12, wobble@gotpcrel32@lo+12 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+24 -; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0 +; GFX12-NEXT: s_add_co_ci_u32 s13, s13, wobble@gotpcrel32@hi+24 +; GFX12-NEXT: s_load_u8 s14, s[4:5], 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 -; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0 +; GFX12-NEXT: s_load_b64 s[6:7], s[12:13], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s8, 1, s12 +; GFX12-NEXT: s_and_b32 s12, 1, s14 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_cmp_eq_u32 s8, 1 -; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-NEXT: s_cselect_b32 s7, s7, s5 -; GFX12-NEXT: s_cselect_b32 s6, s6, s4 +; GFX12-NEXT: s_cmp_eq_u32 s12, 1 +; GFX12-NEXT: s_cselect_b32 s13, s7, s5 +; GFX12-NEXT: s_cselect_b32 s12, s6, s4 ; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index f54a511eff7f1d..b2fd4015d920a4 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -27,6 +27,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[DUMMYRETURNBLOCK:%.*]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br label %loop @@ -42,7 +43,7 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -66,6 +67,7 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond = icmp eq i32 %tmp, 1 @@ -82,7 +84,7 @@ return: define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 ; SI-NEXT: s_cbranch_scc1 .LBB2_4 ; SI-NEXT: ; %bb.1: @@ -129,6 +131,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br i1 undef, label %loop1, label %loop2 @@ -148,7 +151,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-NEXT: s_cbranch_execz .LBB3_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -189,6 +192,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 292722c2607add..cf9fdbdc343919 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %12 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %10 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() @@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %12 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %10 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() @@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %12 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %10 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index e7c77d3123e825..5c5a769178dd94 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -52,65 +52,67 @@ bb: define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s21, s[2:3], 0x24 +; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] ; GFX11-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s20, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_mov_b32 s20, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s21, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 ; GFX11-NEXT: ; %bb.1: ; %bb14 -; GFX11-NEXT: s_load_b128 s[16:19], s[2:3], 0x2c +; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c +; GFX11-NEXT: s_mov_b32 s18, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s17, 0 -; GFX11-NEXT: s_cselect_b32 s22, -1, 0 -; GFX11-NEXT: s_bitcmp0_b32 s17, 0 -; GFX11-NEXT: s_mov_b32 s17, 0 +; GFX11-NEXT: s_bitcmp1_b32 s21, 0 +; GFX11-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX11-NEXT: ; %bb.2: ; %bb15 -; GFX11-NEXT: s_add_u32 s8, s2, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 +; GFX11-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s17, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 ; GFX11-NEXT: s_mov_b32 s13, s14 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s23, s14 +; GFX11-NEXT: s_mov_b32 s21, s14 ; GFX11-NEXT: s_mov_b32 s14, s15 -; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_mov_b32 s14, s23 -; GFX11-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: .LBB2_3: ; %Flow10 +; GFX11-NEXT: s_mov_b32 s14, s21 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-NEXT: s_branch .LBB2_12 +; GFX11-NEXT: .LBB2_3: +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB2_12 -; GFX11-NEXT: ; %bb.4: ; %bb16 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x54 -; GFX11-NEXT: s_bitcmp1_b32 s19, 0 -; GFX11-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-NEXT: s_and_b32 s1, s19, 1 +; GFX11-NEXT: .LBB2_4: ; %bb16 +; GFX11-NEXT: s_load_b32 s0, s[16:17], 0x54 +; GFX11-NEXT: s_bitcmp1_b32 s23, 0 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_and_b32 s1, s23, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader -; GFX11-NEXT: s_load_b128 s[24:27], s[2:3], 0x44 +; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s0, s25, s24 -; GFX11-NEXT: s_mul_i32 s1, s25, s24 +; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 +; GFX11-NEXT: s_mul_i32 s1, s29, s28 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 ; GFX11-NEXT: s_mov_b32 s1, 0 @@ -118,16 +120,16 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, s26 +; GFX11-NEXT: s_lshr_b32 s0, s0, s30 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s0, s0, s18 -; GFX11-NEXT: s_mul_i32 s0, s0, s16 +; GFX11-NEXT: s_mul_i32 s0, s0, s22 +; GFX11-NEXT: s_mul_i32 s0, s0, s20 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s21, s0 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX11-NEXT: s_or_b32 s0, s19, s0 +; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 ; GFX11-NEXT: s_mov_b32 s0, s1 -; GFX11-NEXT: global_load_u16 v1, v0, s[18:19] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s22 +; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -136,30 +138,30 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-NEXT: v_readfirstlane_b32 s13, v0 ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-NEXT: s_and_b32 s1, s7, s1 +; GFX11-NEXT: s_and_b32 s1, s8, s1 ; GFX11-NEXT: s_and_b32 s1, s1, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s13, v2 -; GFX11-NEXT: s_cselect_b32 s1, s13, s9 -; GFX11-NEXT: s_and_b32 s9, 0xffff, s0 +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s1, s19, s13 +; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-NEXT: s_cselect_b32 s9, -1, 0 -; GFX11-NEXT: s_and_b32 s16, s8, exec_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 -; GFX11-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: s_and_b32 s20, s9, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 +; GFX11-NEXT: v_readfirstlane_b32 s13, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s13, v2 -; GFX11-NEXT: s_cselect_b32 s9, s13, s9 +; GFX11-NEXT: v_readfirstlane_b32 s19, v2 +; GFX11-NEXT: s_cselect_b32 s13, s19, s13 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s9, 0 -; GFX11-NEXT: s_cselect_b32 s9, 0x100, 0 -; GFX11-NEXT: s_or_b32 s0, s9, s0 +; GFX11-NEXT: s_bitcmp1_b32 s13, 0 +; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 +; GFX11-NEXT: s_or_b32 s0, s13, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow ; GFX11-NEXT: s_mov_b32 s0, 0 @@ -168,24 +170,24 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: -; GFX11-NEXT: s_xor_b32 s0, s7, -1 +; GFX11-NEXT: s_xor_b32 s0, s8, -1 ; GFX11-NEXT: .LBB2_10: ; %bb17 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccz .LBB2_10 ; GFX11-NEXT: ; %bb.11: ; %Flow6 -; GFX11-NEXT: s_mov_b32 s17, -1 +; GFX11-NEXT: s_mov_b32 s18, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s6, s6, exec_lo -; GFX11-NEXT: s_or_not1_b32 s0, s17, exec_lo +; GFX11-NEXT: s_and_b32 s20, s2, exec_lo +; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s20 -; GFX11-NEXT: s_and_saveexec_b32 s7, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s3, s0 ; GFX11-NEXT: s_cbranch_execz .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 -; GFX11-NEXT: s_add_u32 s8, s2, 0x58 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 +; GFX11-NEXT: s_add_u32 s8, s16, 0x58 +; GFX11-NEXT: s_addc_u32 s9, s17, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12 @@ -194,10 +196,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s6, s6, exec_lo +; GFX11-NEXT: s_or_b32 s20, s20, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; GFX11-NEXT: s_and_saveexec_b32 s0, s6 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11-NEXT: s_and_saveexec_b32 s0, s20 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index ea18e0d9eeefbd..605a58125bb79a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -4,28 +4,28 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_inselt_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -56,24 +56,24 @@ entry: define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) { ; GCN-LABEL: int4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: s_cselect_b32 s2, s7, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: s_cselect_b32 s3, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 +; GCN-NEXT: s_cselect_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, 1 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -85,19 +85,19 @@ entry: define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm entry: @@ -109,21 +109,21 @@ entry: define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) { ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GCN-NEXT: s_load_dword s12, s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: s_mov_b32 m0, s12 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 ; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v8, s2 @@ -142,30 +142,30 @@ entry: define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) { ; GCN-LABEL: float16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v17, s3 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_mov_b32_e32 v8, s12 -; GCN-NEXT: v_mov_b32_e32 v9, s13 -; GCN-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NEXT: v_mov_b32_e32 v11, s15 -; GCN-NEXT: v_mov_b32_e32 v12, s16 -; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: v_mov_b32_e32 v14, s18 -; GCN-NEXT: v_mov_b32_e32 v15, s19 -; GCN-NEXT: s_mov_b32 m0, s20 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NEXT: v_mov_b32_e32 v10, s18 +; GCN-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NEXT: v_mov_b32_e32 v12, s20 +; GCN-NEXT: v_mov_b32_e32 v13, s21 +; GCN-NEXT: v_mov_b32_e32 v14, s22 +; GCN-NEXT: v_mov_b32_e32 v15, s23 +; GCN-NEXT: s_mov_b32 m0, s4 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 @@ -195,18 +195,18 @@ entry: define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x124 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v33, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 ; GCN-NEXT: v_mov_b32_e32 v5, s41 @@ -220,22 +220,22 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: v_mov_b32_e32 v16, s4 -; GCN-NEXT: v_mov_b32_e32 v17, s5 -; GCN-NEXT: v_mov_b32_e32 v18, s6 -; GCN-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NEXT: v_mov_b32_e32 v20, s8 -; GCN-NEXT: v_mov_b32_e32 v21, s9 -; GCN-NEXT: v_mov_b32_e32 v22, s10 -; GCN-NEXT: v_mov_b32_e32 v23, s11 -; GCN-NEXT: v_mov_b32_e32 v24, s12 -; GCN-NEXT: v_mov_b32_e32 v25, s13 -; GCN-NEXT: v_mov_b32_e32 v26, s14 -; GCN-NEXT: v_mov_b32_e32 v27, s15 -; GCN-NEXT: v_mov_b32_e32 v28, s16 -; GCN-NEXT: v_mov_b32_e32 v29, s17 -; GCN-NEXT: v_mov_b32_e32 v30, s18 -; GCN-NEXT: v_mov_b32_e32 v31, s19 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: v_mov_b32_e32 v20, s12 +; GCN-NEXT: v_mov_b32_e32 v21, s13 +; GCN-NEXT: v_mov_b32_e32 v22, s14 +; GCN-NEXT: v_mov_b32_e32 v23, s15 +; GCN-NEXT: v_mov_b32_e32 v24, s16 +; GCN-NEXT: v_mov_b32_e32 v25, s17 +; GCN-NEXT: v_mov_b32_e32 v26, s18 +; GCN-NEXT: v_mov_b32_e32 v27, s19 +; GCN-NEXT: v_mov_b32_e32 v28, s20 +; GCN-NEXT: v_mov_b32_e32 v29, s21 +; GCN-NEXT: v_mov_b32_e32 v30, s22 +; GCN-NEXT: v_mov_b32_e32 v31, s23 ; GCN-NEXT: v_mov_b32_e32 v32, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 @@ -289,8 +289,8 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -314,7 +314,7 @@ entry: define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -335,57 +335,57 @@ entry: define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) { ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_lshr_b32 s7, s3, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_cmp_lg_u32 s6, 6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-NEXT: s_lshr_b32 s3, s2, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s6, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s6, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s5 +; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_lshr_b32 s1, s0, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -397,7 +397,7 @@ entry: define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -418,8 +418,8 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -443,10 +443,10 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: s_lshl_b32 s4, s6, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 ; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 @@ -467,96 +467,96 @@ entry: define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 24 -; GCN-NEXT: s_cmp_lg_u32 s8, 15 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_lshr_b32 s3, s7, 16 -; GCN-NEXT: s_lshl_b32 s2, s2, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 14 +; GCN-NEXT: s_lshr_b32 s7, s3, 24 +; GCN-NEXT: s_cmp_lg_u32 s6, 15 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshr_b32 s8, s3, 16 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 14 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 0xff +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshr_b32 s9, s3, 8 +; GCN-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 13 +; GCN-NEXT: s_cselect_b32 s8, s9, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 12 ; GCN-NEXT: s_cselect_b32 s3, s3, 1 ; GCN-NEXT: s_and_b32 s3, s3, 0xff -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_lshr_b32 s9, s7, 8 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 13 -; GCN-NEXT: s_cselect_b32 s3, s9, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 12 -; GCN-NEXT: s_cselect_b32 s7, s7, 1 -; GCN-NEXT: s_and_b32 s7, s7, 0xff -; GCN-NEXT: s_or_b32 s3, s7, s3 +; GCN-NEXT: s_or_b32 s3, s3, s8 ; GCN-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_lshr_b32 s3, s6, 24 -; GCN-NEXT: s_cmp_lg_u32 s8, 11 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 8 -; GCN-NEXT: s_lshr_b32 s7, s6, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 10 -; GCN-NEXT: s_cselect_b32 s7, s7, 1 -; GCN-NEXT: s_and_b32 s7, s7, 0xff -; GCN-NEXT: s_or_b32 s3, s7, s3 -; GCN-NEXT: s_lshl_b32 s3, s3, 16 -; GCN-NEXT: s_lshr_b32 s7, s6, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 9 +; GCN-NEXT: s_or_b32 s3, s3, s7 +; GCN-NEXT: s_lshr_b32 s7, s2, 24 +; GCN-NEXT: s_cmp_lg_u32 s6, 11 ; GCN-NEXT: s_cselect_b32 s7, s7, 1 ; GCN-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 8 -; GCN-NEXT: s_cselect_b32 s6, s6, 1 -; GCN-NEXT: s_and_b32 s6, s6, 0xff -; GCN-NEXT: s_or_b32 s6, s6, s7 -; GCN-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NEXT: s_or_b32 s3, s6, s3 -; GCN-NEXT: s_lshr_b32 s6, s5, 24 -; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: s_cselect_b32 s6, s6, 1 -; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_lshr_b32 s7, s5, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 -; GCN-NEXT: s_cselect_b32 s7, s7, 1 -; GCN-NEXT: s_and_b32 s7, s7, 0xff -; GCN-NEXT: s_or_b32 s6, s7, s6 -; GCN-NEXT: s_lshl_b32 s6, s6, 16 -; GCN-NEXT: s_lshr_b32 s7, s5, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_lshr_b32 s8, s2, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 10 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 0xff +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NEXT: s_lshr_b32 s8, s2, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 9 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 8 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_and_b32 s2, s2, 0xff +; GCN-NEXT: s_or_b32 s2, s2, s8 +; GCN-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NEXT: s_or_b32 s2, s2, s7 +; GCN-NEXT: s_lshr_b32 s7, s1, 24 +; GCN-NEXT: s_cmp_lg_u32 s6, 7 ; GCN-NEXT: s_cselect_b32 s7, s7, 1 ; GCN-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_and_b32 s5, s5, 0xff -; GCN-NEXT: s_or_b32 s5, s5, s7 -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: s_or_b32 s5, s5, s6 -; GCN-NEXT: s_lshr_b32 s6, s4, 24 -; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: s_cselect_b32 s6, s6, 1 -; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: s_cselect_b32 s7, s7, 1 -; GCN-NEXT: s_and_b32 s7, s7, 0xff -; GCN-NEXT: s_or_b32 s6, s7, s6 -; GCN-NEXT: s_lshl_b32 s6, s6, 16 -; GCN-NEXT: s_lshr_b32 s7, s4, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_lshr_b32 s8, s1, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 6 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 0xff +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NEXT: s_lshr_b32 s8, s1, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 5 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 4 +; GCN-NEXT: s_cselect_b32 s1, s1, 1 +; GCN-NEXT: s_and_b32 s1, s1, 0xff +; GCN-NEXT: s_or_b32 s1, s1, s8 +; GCN-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NEXT: s_or_b32 s1, s1, s7 +; GCN-NEXT: s_lshr_b32 s7, s0, 24 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 ; GCN-NEXT: s_cselect_b32 s7, s7, 1 ; GCN-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, 1 -; GCN-NEXT: s_and_b32 s4, s4, 0xff -; GCN-NEXT: s_or_b32 s4, s4, s7 -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_or_b32 s4, s4, s6 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_lshr_b32 s8, s0, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 0xff +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NEXT: s_lshr_b32 s8, s0, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, 1 +; GCN-NEXT: s_and_b32 s0, s0, 0xff +; GCN-NEXT: s_or_b32 s0, s0, s8 +; GCN-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NEXT: s_or_b32 s0, s0, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -568,22 +568,22 @@ entry: define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) { ; GCN-LABEL: double2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s8, 1 -; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s7 -; GCN-NEXT: s_cselect_b32 s3, 0, s6 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 -; GCN-NEXT: s_cselect_b32 s4, 0, s4 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_cmp_eq_u32 s6, 1 +; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3 +; GCN-NEXT: s_cselect_b32 s2, 0, s2 +; GCN-NEXT: s_cmp_eq_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s1, 0x3ff00000, s1 +; GCN-NEXT: s_cselect_b32 s0, 0, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -595,10 +595,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s12, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x84 -; GCN-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; GCN-NEXT: s_load_dword s12, s[4:5], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x84 +; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 @@ -649,28 +649,28 @@ entry: define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) { ; GCN-LABEL: double8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s20, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_mov_b32_e32 v8, s12 -; GCN-NEXT: v_mov_b32_e32 v9, s13 -; GCN-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NEXT: v_mov_b32_e32 v11, s15 -; GCN-NEXT: v_mov_b32_e32 v12, s16 -; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: v_mov_b32_e32 v14, s18 -; GCN-NEXT: v_mov_b32_e32 v15, s19 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NEXT: v_mov_b32_e32 v10, s18 +; GCN-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NEXT: v_mov_b32_e32 v12, s20 +; GCN-NEXT: v_mov_b32_e32 v13, s21 +; GCN-NEXT: v_mov_b32_e32 v14, s22 +; GCN-NEXT: v_mov_b32_e32 v15, s23 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 @@ -705,46 +705,47 @@ entry: define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) { ; GCN-LABEL: double7_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x94 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x84 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x64 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x94 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x84 +; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_mov_b32_e32 v8, s12 -; GCN-NEXT: v_mov_b32_e32 v9, s13 -; GCN-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0xa4 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s0, s0, 1 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_add_u32 s0, s6, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v15, s3 -; GCN-NEXT: v_mov_b32_e32 v14, s2 +; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: v_mov_b32_e32 v15, s1 +; GCN-NEXT: v_mov_b32_e32 v14, s0 ; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s6, 48 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s0, s0, 32 +; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_add_u32 s0, s6, 32 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s7, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -758,15 +759,15 @@ entry: define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { ; GCN-LABEL: double16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ -782,22 +783,22 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: v_mov_b32_e32 v16, s4 -; GCN-NEXT: v_mov_b32_e32 v17, s5 -; GCN-NEXT: v_mov_b32_e32 v18, s6 -; GCN-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NEXT: v_mov_b32_e32 v20, s8 -; GCN-NEXT: v_mov_b32_e32 v21, s9 -; GCN-NEXT: v_mov_b32_e32 v22, s10 -; GCN-NEXT: v_mov_b32_e32 v23, s11 -; GCN-NEXT: v_mov_b32_e32 v24, s12 -; GCN-NEXT: v_mov_b32_e32 v25, s13 -; GCN-NEXT: v_mov_b32_e32 v26, s14 -; GCN-NEXT: v_mov_b32_e32 v27, s15 -; GCN-NEXT: v_mov_b32_e32 v28, s16 -; GCN-NEXT: v_mov_b32_e32 v29, s17 -; GCN-NEXT: v_mov_b32_e32 v30, s18 -; GCN-NEXT: v_mov_b32_e32 v31, s19 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: v_mov_b32_e32 v20, s12 +; GCN-NEXT: v_mov_b32_e32 v21, s13 +; GCN-NEXT: v_mov_b32_e32 v22, s14 +; GCN-NEXT: v_mov_b32_e32 v23, s15 +; GCN-NEXT: v_mov_b32_e32 v24, s16 +; GCN-NEXT: v_mov_b32_e32 v25, s17 +; GCN-NEXT: v_mov_b32_e32 v26, s18 +; GCN-NEXT: v_mov_b32_e32 v27, s19 +; GCN-NEXT: v_mov_b32_e32 v28, s20 +; GCN-NEXT: v_mov_b32_e32 v29, s21 +; GCN-NEXT: v_mov_b32_e32 v30, s22 +; GCN-NEXT: v_mov_b32_e32 v31, s23 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 @@ -856,35 +857,35 @@ entry: define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) { ; GCN-LABEL: double15_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x114 -; GCN-NEXT: s_load_dwordx4 s[20:23], s[2:3], 0x104 -; GCN-NEXT: s_load_dwordx8 s[24:31], s[2:3], 0xe4 +; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x114 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x104 +; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_load_dword s4, s[2:3], 0x124 -; GCN-NEXT: v_mov_b32_e32 v28, s0 -; GCN-NEXT: v_mov_b32_e32 v29, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v24, s0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x124 +; GCN-NEXT: v_mov_b32_e32 v25, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s4, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_mov_b32_e32 v8, s12 -; GCN-NEXT: v_mov_b32_e32 v9, s13 -; GCN-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NEXT: v_mov_b32_e32 v11, s15 -; GCN-NEXT: v_mov_b32_e32 v12, s16 -; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: v_mov_b32_e32 v14, s18 -; GCN-NEXT: v_mov_b32_e32 v15, s19 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NEXT: v_mov_b32_e32 v10, s18 +; GCN-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NEXT: v_mov_b32_e32 v12, s20 +; GCN-NEXT: v_mov_b32_e32 v13, s21 +; GCN-NEXT: v_mov_b32_e32 v14, s22 +; GCN-NEXT: v_mov_b32_e32 v15, s23 ; GCN-NEXT: v_mov_b32_e32 v16, s24 ; GCN-NEXT: v_mov_b32_e32 v17, s25 ; GCN-NEXT: v_mov_b32_e32 v18, s26 @@ -893,10 +894,10 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v21, s29 ; GCN-NEXT: v_mov_b32_e32 v22, s30 ; GCN-NEXT: v_mov_b32_e32 v23, s31 -; GCN-NEXT: v_mov_b32_e32 v24, s20 -; GCN-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NEXT: v_mov_b32_e32 v26, s22 -; GCN-NEXT: v_mov_b32_e32 v27, s23 +; GCN-NEXT: v_mov_b32_e32 v26, s2 +; GCN-NEXT: v_mov_b32_e32 v27, s3 +; GCN-NEXT: v_mov_b32_e32 v28, s6 +; GCN-NEXT: v_mov_b32_e32 v29, s7 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x50 @@ -953,11 +954,11 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 ; GCN-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-NEXT: s_add_u32 s12, s12, s9 +; GCN-NEXT: s_add_u32 s12, s12, s11 ; GCN-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bfe_u32 s6, s2, 0x10003 @@ -1007,836 +1008,836 @@ entry: define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit128_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GCN-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s9, s4, 0xf0001 -; GCN-NEXT: s_lshr_b32 s42, s5, 16 -; GCN-NEXT: v_writelane_b32 v6, s0, 0 -; GCN-NEXT: v_writelane_b32 v6, s1, 1 -; GCN-NEXT: s_lshr_b32 s0, s4, 16 -; GCN-NEXT: v_writelane_b32 v6, s0, 2 -; GCN-NEXT: s_lshr_b32 s0, s4, 17 -; GCN-NEXT: v_writelane_b32 v6, s0, 3 -; GCN-NEXT: s_lshr_b32 s0, s4, 18 -; GCN-NEXT: v_writelane_b32 v6, s0, 4 -; GCN-NEXT: s_lshr_b32 s0, s4, 19 -; GCN-NEXT: v_writelane_b32 v6, s0, 5 -; GCN-NEXT: s_lshr_b32 s0, s4, 20 -; GCN-NEXT: v_writelane_b32 v6, s0, 6 -; GCN-NEXT: s_lshr_b32 s0, s4, 21 -; GCN-NEXT: v_writelane_b32 v6, s0, 7 -; GCN-NEXT: s_lshr_b32 s0, s4, 22 -; GCN-NEXT: v_writelane_b32 v6, s0, 8 -; GCN-NEXT: s_lshr_b32 s0, s4, 23 -; GCN-NEXT: v_writelane_b32 v6, s0, 9 -; GCN-NEXT: s_lshr_b32 s0, s4, 24 -; GCN-NEXT: v_writelane_b32 v6, s0, 10 -; GCN-NEXT: s_lshr_b32 s0, s4, 25 -; GCN-NEXT: v_writelane_b32 v6, s0, 11 -; GCN-NEXT: s_lshr_b32 s0, s4, 26 -; GCN-NEXT: v_writelane_b32 v6, s0, 12 -; GCN-NEXT: s_lshr_b32 s0, s4, 27 -; GCN-NEXT: v_writelane_b32 v6, s0, 13 -; GCN-NEXT: s_lshr_b32 s0, s4, 28 -; GCN-NEXT: v_writelane_b32 v6, s0, 14 -; GCN-NEXT: s_lshr_b32 s0, s4, 29 -; GCN-NEXT: v_writelane_b32 v6, s0, 15 -; GCN-NEXT: s_lshr_b32 s0, s4, 30 -; GCN-NEXT: v_writelane_b32 v6, s0, 16 -; GCN-NEXT: s_lshr_b32 s0, s4, 31 -; GCN-NEXT: v_writelane_b32 v6, s0, 17 +; GCN-NEXT: s_bfe_u32 s9, s0, 0xf0001 +; GCN-NEXT: s_lshr_b32 s42, s1, 16 +; GCN-NEXT: v_writelane_b32 v6, s4, 0 +; GCN-NEXT: v_writelane_b32 v6, s5, 1 +; GCN-NEXT: s_lshr_b32 s4, s0, 16 +; GCN-NEXT: v_writelane_b32 v6, s4, 2 +; GCN-NEXT: s_lshr_b32 s4, s0, 17 +; GCN-NEXT: v_writelane_b32 v6, s4, 3 +; GCN-NEXT: s_lshr_b32 s4, s0, 18 +; GCN-NEXT: v_writelane_b32 v6, s4, 4 +; GCN-NEXT: s_lshr_b32 s4, s0, 19 +; GCN-NEXT: v_writelane_b32 v6, s4, 5 +; GCN-NEXT: s_lshr_b32 s4, s0, 20 +; GCN-NEXT: v_writelane_b32 v6, s4, 6 +; GCN-NEXT: s_lshr_b32 s4, s0, 21 +; GCN-NEXT: v_writelane_b32 v6, s4, 7 +; GCN-NEXT: s_lshr_b32 s4, s0, 22 +; GCN-NEXT: v_writelane_b32 v6, s4, 8 +; GCN-NEXT: s_lshr_b32 s4, s0, 23 +; GCN-NEXT: v_writelane_b32 v6, s4, 9 +; GCN-NEXT: s_lshr_b32 s4, s0, 24 +; GCN-NEXT: v_writelane_b32 v6, s4, 10 +; GCN-NEXT: s_lshr_b32 s4, s0, 25 +; GCN-NEXT: v_writelane_b32 v6, s4, 11 +; GCN-NEXT: s_lshr_b32 s4, s0, 26 +; GCN-NEXT: v_writelane_b32 v6, s4, 12 +; GCN-NEXT: s_lshr_b32 s4, s0, 27 +; GCN-NEXT: v_writelane_b32 v6, s4, 13 +; GCN-NEXT: s_lshr_b32 s4, s0, 28 +; GCN-NEXT: v_writelane_b32 v6, s4, 14 +; GCN-NEXT: s_lshr_b32 s4, s0, 29 +; GCN-NEXT: v_writelane_b32 v6, s4, 15 +; GCN-NEXT: s_lshr_b32 s4, s0, 30 +; GCN-NEXT: v_writelane_b32 v6, s4, 16 +; GCN-NEXT: s_lshr_b32 s4, s0, 31 +; GCN-NEXT: v_writelane_b32 v6, s4, 17 ; GCN-NEXT: v_writelane_b32 v6, s9, 18 -; GCN-NEXT: s_bfe_u32 s9, s4, 0xe0002 +; GCN-NEXT: s_bfe_u32 s9, s0, 0xe0002 ; GCN-NEXT: v_writelane_b32 v6, s9, 19 -; GCN-NEXT: s_bfe_u32 s9, s4, 0xd0003 +; GCN-NEXT: s_bfe_u32 s9, s0, 0xd0003 ; GCN-NEXT: v_writelane_b32 v6, s9, 20 -; GCN-NEXT: s_bfe_u32 s9, s4, 0xc0004 +; GCN-NEXT: s_bfe_u32 s9, s0, 0xc0004 ; GCN-NEXT: v_writelane_b32 v6, s9, 21 -; GCN-NEXT: s_bfe_u32 s9, s4, 0xb0005 +; GCN-NEXT: s_bfe_u32 s9, s0, 0xb0005 ; GCN-NEXT: v_writelane_b32 v6, s9, 22 -; GCN-NEXT: s_bfe_u32 s9, s4, 0xa0006 +; GCN-NEXT: s_bfe_u32 s9, s0, 0xa0006 ; GCN-NEXT: v_writelane_b32 v6, s9, 23 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x90007 +; GCN-NEXT: s_bfe_u32 s9, s0, 0x90007 ; GCN-NEXT: v_writelane_b32 v6, s9, 24 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x80008 +; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GCN-NEXT: v_writelane_b32 v6, s9, 25 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x70009 +; GCN-NEXT: s_bfe_u32 s9, s0, 0x70009 ; GCN-NEXT: v_writelane_b32 v6, s9, 26 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x6000a +; GCN-NEXT: s_bfe_u32 s9, s0, 0x6000a ; GCN-NEXT: v_writelane_b32 v6, s9, 27 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x5000b +; GCN-NEXT: s_bfe_u32 s9, s0, 0x5000b ; GCN-NEXT: v_writelane_b32 v6, s9, 28 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x4000c +; GCN-NEXT: s_bfe_u32 s9, s0, 0x4000c ; GCN-NEXT: v_writelane_b32 v6, s9, 29 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x3000d +; GCN-NEXT: s_bfe_u32 s9, s0, 0x3000d ; GCN-NEXT: v_writelane_b32 v6, s9, 30 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x2000e +; GCN-NEXT: s_bfe_u32 s9, s0, 0x2000e ; GCN-NEXT: v_writelane_b32 v6, s9, 31 -; GCN-NEXT: s_bfe_u32 s9, s4, 0x1000f +; GCN-NEXT: s_bfe_u32 s9, s0, 0x1000f ; GCN-NEXT: v_writelane_b32 v6, s9, 32 -; GCN-NEXT: s_bfe_u32 s9, s5, 0xf0001 -; GCN-NEXT: s_lshr_b32 s43, s5, 17 -; GCN-NEXT: s_lshr_b32 s45, s5, 18 -; GCN-NEXT: s_lshr_b32 s47, s5, 19 -; GCN-NEXT: s_lshr_b32 s50, s5, 20 -; GCN-NEXT: s_lshr_b32 s51, s5, 21 -; GCN-NEXT: s_lshr_b32 s53, s5, 22 -; GCN-NEXT: s_lshr_b32 s55, s5, 23 -; GCN-NEXT: s_lshr_b32 s58, s5, 24 -; GCN-NEXT: s_lshr_b32 s59, s5, 25 -; GCN-NEXT: s_lshr_b32 s61, s5, 26 -; GCN-NEXT: s_lshr_b32 s63, s5, 27 -; GCN-NEXT: s_lshr_b32 s66, s5, 28 -; GCN-NEXT: s_lshr_b32 s67, s5, 29 -; GCN-NEXT: s_lshr_b32 s68, s5, 30 -; GCN-NEXT: s_lshr_b32 s69, s5, 31 -; GCN-NEXT: s_lshr_b32 s73, s6, 16 -; GCN-NEXT: s_lshr_b32 s74, s6, 17 -; GCN-NEXT: s_lshr_b32 s77, s6, 18 -; GCN-NEXT: s_lshr_b32 s78, s6, 19 -; GCN-NEXT: s_lshr_b32 s81, s6, 20 -; GCN-NEXT: s_lshr_b32 s82, s6, 21 -; GCN-NEXT: s_lshr_b32 s84, s6, 22 -; GCN-NEXT: s_lshr_b32 s86, s6, 23 -; GCN-NEXT: s_lshr_b32 s89, s6, 24 -; GCN-NEXT: s_lshr_b32 s90, s6, 25 -; GCN-NEXT: s_lshr_b32 s93, s6, 26 -; GCN-NEXT: s_lshr_b32 s94, s6, 27 -; GCN-NEXT: s_lshr_b32 vcc_hi, s6, 28 -; GCN-NEXT: s_lshr_b32 s39, s6, 29 -; GCN-NEXT: s_lshr_b32 s38, s6, 30 -; GCN-NEXT: s_lshr_b32 s37, s6, 31 -; GCN-NEXT: s_lshr_b32 s33, s7, 16 -; GCN-NEXT: s_lshr_b32 s31, s7, 17 -; GCN-NEXT: s_lshr_b32 s28, s7, 18 -; GCN-NEXT: s_lshr_b32 s27, s7, 19 -; GCN-NEXT: s_lshr_b32 s24, s7, 20 -; GCN-NEXT: s_lshr_b32 s23, s7, 21 -; GCN-NEXT: s_lshr_b32 s20, s7, 22 -; GCN-NEXT: s_lshr_b32 s19, s7, 23 -; GCN-NEXT: s_lshr_b32 s16, s7, 24 -; GCN-NEXT: s_lshr_b32 s15, s7, 25 -; GCN-NEXT: s_lshr_b32 s12, s7, 26 -; GCN-NEXT: s_lshr_b32 s11, s7, 27 -; GCN-NEXT: s_lshr_b32 s3, s7, 28 -; GCN-NEXT: s_lshr_b32 s2, s7, 29 -; GCN-NEXT: s_lshr_b32 s1, s7, 30 -; GCN-NEXT: s_lshr_b32 s0, s7, 31 +; GCN-NEXT: s_bfe_u32 s9, s1, 0xf0001 +; GCN-NEXT: s_lshr_b32 s43, s1, 17 +; GCN-NEXT: s_lshr_b32 s45, s1, 18 +; GCN-NEXT: s_lshr_b32 s47, s1, 19 +; GCN-NEXT: s_lshr_b32 s50, s1, 20 +; GCN-NEXT: s_lshr_b32 s51, s1, 21 +; GCN-NEXT: s_lshr_b32 s53, s1, 22 +; GCN-NEXT: s_lshr_b32 s55, s1, 23 +; GCN-NEXT: s_lshr_b32 s58, s1, 24 +; GCN-NEXT: s_lshr_b32 s59, s1, 25 +; GCN-NEXT: s_lshr_b32 s61, s1, 26 +; GCN-NEXT: s_lshr_b32 s63, s1, 27 +; GCN-NEXT: s_lshr_b32 s66, s1, 28 +; GCN-NEXT: s_lshr_b32 s67, s1, 29 +; GCN-NEXT: s_lshr_b32 s68, s1, 30 +; GCN-NEXT: s_lshr_b32 s69, s1, 31 +; GCN-NEXT: s_lshr_b32 s73, s2, 16 +; GCN-NEXT: s_lshr_b32 s74, s2, 17 +; GCN-NEXT: s_lshr_b32 s77, s2, 18 +; GCN-NEXT: s_lshr_b32 s78, s2, 19 +; GCN-NEXT: s_lshr_b32 s81, s2, 20 +; GCN-NEXT: s_lshr_b32 s82, s2, 21 +; GCN-NEXT: s_lshr_b32 s84, s2, 22 +; GCN-NEXT: s_lshr_b32 s86, s2, 23 +; GCN-NEXT: s_lshr_b32 s89, s2, 24 +; GCN-NEXT: s_lshr_b32 s90, s2, 25 +; GCN-NEXT: s_lshr_b32 s93, s2, 26 +; GCN-NEXT: s_lshr_b32 s94, s2, 27 +; GCN-NEXT: s_lshr_b32 vcc_hi, s2, 28 +; GCN-NEXT: s_lshr_b32 s39, s2, 29 +; GCN-NEXT: s_lshr_b32 s38, s2, 30 +; GCN-NEXT: s_lshr_b32 s37, s2, 31 +; GCN-NEXT: s_lshr_b32 s33, s3, 16 +; GCN-NEXT: s_lshr_b32 s31, s3, 17 +; GCN-NEXT: s_lshr_b32 s28, s3, 18 +; GCN-NEXT: s_lshr_b32 s27, s3, 19 +; GCN-NEXT: s_lshr_b32 s24, s3, 20 +; GCN-NEXT: s_lshr_b32 s23, s3, 21 +; GCN-NEXT: s_lshr_b32 s20, s3, 22 +; GCN-NEXT: s_lshr_b32 s19, s3, 23 +; GCN-NEXT: s_lshr_b32 s16, s3, 24 +; GCN-NEXT: s_lshr_b32 s15, s3, 25 +; GCN-NEXT: s_lshr_b32 s12, s3, 26 +; GCN-NEXT: s_lshr_b32 s11, s3, 27 +; GCN-NEXT: s_lshr_b32 s8, s3, 28 +; GCN-NEXT: s_lshr_b32 s7, s3, 29 +; GCN-NEXT: s_lshr_b32 s5, s3, 30 +; GCN-NEXT: s_lshr_b32 s4, s3, 31 ; GCN-NEXT: v_writelane_b32 v6, s9, 33 -; GCN-NEXT: s_bfe_u32 s40, s5, 0xe0002 -; GCN-NEXT: s_bfe_u32 s41, s5, 0xd0003 -; GCN-NEXT: s_bfe_u32 s44, s5, 0xc0004 -; GCN-NEXT: s_bfe_u32 s46, s5, 0xb0005 -; GCN-NEXT: s_bfe_u32 s48, s5, 0xa0006 -; GCN-NEXT: s_bfe_u32 s49, s5, 0x90007 -; GCN-NEXT: s_bfe_u32 s52, s5, 0x80008 -; GCN-NEXT: s_bfe_u32 s54, s5, 0x70009 -; GCN-NEXT: s_bfe_u32 s56, s5, 0x6000a -; GCN-NEXT: s_bfe_u32 s57, s5, 0x5000b -; GCN-NEXT: s_bfe_u32 s60, s5, 0x4000c -; GCN-NEXT: s_bfe_u32 s62, s5, 0x3000d -; GCN-NEXT: s_bfe_u32 s64, s5, 0x2000e -; GCN-NEXT: s_bfe_u32 s65, s5, 0x1000f -; GCN-NEXT: s_bfe_u32 s70, s6, 0xf0001 -; GCN-NEXT: s_bfe_u32 s71, s6, 0xe0002 -; GCN-NEXT: s_bfe_u32 s72, s6, 0xd0003 -; GCN-NEXT: s_bfe_u32 s75, s6, 0xc0004 -; GCN-NEXT: s_bfe_u32 s76, s6, 0xb0005 -; GCN-NEXT: s_bfe_u32 s79, s6, 0xa0006 -; GCN-NEXT: s_bfe_u32 s80, s6, 0x90007 -; GCN-NEXT: s_bfe_u32 s83, s6, 0x80008 -; GCN-NEXT: s_bfe_u32 s85, s6, 0x70009 -; GCN-NEXT: s_bfe_u32 s87, s6, 0x6000a -; GCN-NEXT: s_bfe_u32 s88, s6, 0x5000b -; GCN-NEXT: s_bfe_u32 s91, s6, 0x4000c -; GCN-NEXT: s_bfe_u32 s92, s6, 0x3000d -; GCN-NEXT: s_bfe_u32 s95, s6, 0x2000e -; GCN-NEXT: s_bfe_u32 vcc_lo, s6, 0x1000f -; GCN-NEXT: s_bfe_u32 s36, s7, 0xf0001 -; GCN-NEXT: s_bfe_u32 s35, s7, 0xe0002 -; GCN-NEXT: s_bfe_u32 s34, s7, 0xd0003 -; GCN-NEXT: s_bfe_u32 s30, s7, 0xc0004 -; GCN-NEXT: s_bfe_u32 s29, s7, 0xb0005 -; GCN-NEXT: s_bfe_u32 s26, s7, 0xa0006 -; GCN-NEXT: s_bfe_u32 s25, s7, 0x90007 -; GCN-NEXT: s_bfe_u32 s22, s7, 0x80008 -; GCN-NEXT: s_bfe_u32 s21, s7, 0x70009 -; GCN-NEXT: s_bfe_u32 s18, s7, 0x6000a -; GCN-NEXT: s_bfe_u32 s17, s7, 0x5000b -; GCN-NEXT: s_bfe_u32 s14, s7, 0x4000c -; GCN-NEXT: s_bfe_u32 s13, s7, 0x3000d -; GCN-NEXT: s_bfe_u32 s10, s7, 0x2000e -; GCN-NEXT: s_bfe_u32 s9, s7, 0x1000f -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7f -; GCN-NEXT: s_cselect_b32 s0, s0, 1 -; GCN-NEXT: s_lshl_b32 s0, s0, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7e -; GCN-NEXT: s_cselect_b32 s1, s1, 1 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 2 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7d -; GCN-NEXT: s_cselect_b32 s1, s2, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7c -; GCN-NEXT: s_cselect_b32 s2, s3, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 3 -; GCN-NEXT: s_or_b32 s0, s1, s0 -; GCN-NEXT: s_lshl_b32 s0, s0, 12 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7b -; GCN-NEXT: s_cselect_b32 s1, s11, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x7a -; GCN-NEXT: s_cselect_b32 s2, s12, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x79 -; GCN-NEXT: s_cselect_b32 s2, s15, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x78 -; GCN-NEXT: s_cselect_b32 s3, s16, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 15 -; GCN-NEXT: s_lshl_b32 s1, s1, 8 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x77 -; GCN-NEXT: s_cselect_b32 s1, s19, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x76 -; GCN-NEXT: s_cselect_b32 s2, s20, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x75 -; GCN-NEXT: s_cselect_b32 s2, s23, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x74 -; GCN-NEXT: s_cselect_b32 s3, s24, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 4 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x73 -; GCN-NEXT: s_cselect_b32 s2, s27, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x72 -; GCN-NEXT: s_cselect_b32 s3, s28, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x71 -; GCN-NEXT: s_cselect_b32 s3, s31, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x70 +; GCN-NEXT: s_bfe_u32 s40, s1, 0xe0002 +; GCN-NEXT: s_bfe_u32 s41, s1, 0xd0003 +; GCN-NEXT: s_bfe_u32 s44, s1, 0xc0004 +; GCN-NEXT: s_bfe_u32 s46, s1, 0xb0005 +; GCN-NEXT: s_bfe_u32 s48, s1, 0xa0006 +; GCN-NEXT: s_bfe_u32 s49, s1, 0x90007 +; GCN-NEXT: s_bfe_u32 s52, s1, 0x80008 +; GCN-NEXT: s_bfe_u32 s54, s1, 0x70009 +; GCN-NEXT: s_bfe_u32 s56, s1, 0x6000a +; GCN-NEXT: s_bfe_u32 s57, s1, 0x5000b +; GCN-NEXT: s_bfe_u32 s60, s1, 0x4000c +; GCN-NEXT: s_bfe_u32 s62, s1, 0x3000d +; GCN-NEXT: s_bfe_u32 s64, s1, 0x2000e +; GCN-NEXT: s_bfe_u32 s65, s1, 0x1000f +; GCN-NEXT: s_bfe_u32 s70, s2, 0xf0001 +; GCN-NEXT: s_bfe_u32 s71, s2, 0xe0002 +; GCN-NEXT: s_bfe_u32 s72, s2, 0xd0003 +; GCN-NEXT: s_bfe_u32 s75, s2, 0xc0004 +; GCN-NEXT: s_bfe_u32 s76, s2, 0xb0005 +; GCN-NEXT: s_bfe_u32 s79, s2, 0xa0006 +; GCN-NEXT: s_bfe_u32 s80, s2, 0x90007 +; GCN-NEXT: s_bfe_u32 s83, s2, 0x80008 +; GCN-NEXT: s_bfe_u32 s85, s2, 0x70009 +; GCN-NEXT: s_bfe_u32 s87, s2, 0x6000a +; GCN-NEXT: s_bfe_u32 s88, s2, 0x5000b +; GCN-NEXT: s_bfe_u32 s91, s2, 0x4000c +; GCN-NEXT: s_bfe_u32 s92, s2, 0x3000d +; GCN-NEXT: s_bfe_u32 s95, s2, 0x2000e +; GCN-NEXT: s_bfe_u32 vcc_lo, s2, 0x1000f +; GCN-NEXT: s_bfe_u32 s36, s3, 0xf0001 +; GCN-NEXT: s_bfe_u32 s35, s3, 0xe0002 +; GCN-NEXT: s_bfe_u32 s34, s3, 0xd0003 +; GCN-NEXT: s_bfe_u32 s30, s3, 0xc0004 +; GCN-NEXT: s_bfe_u32 s29, s3, 0xb0005 +; GCN-NEXT: s_bfe_u32 s26, s3, 0xa0006 +; GCN-NEXT: s_bfe_u32 s25, s3, 0x90007 +; GCN-NEXT: s_bfe_u32 s22, s3, 0x80008 +; GCN-NEXT: s_bfe_u32 s21, s3, 0x70009 +; GCN-NEXT: s_bfe_u32 s18, s3, 0x6000a +; GCN-NEXT: s_bfe_u32 s17, s3, 0x5000b +; GCN-NEXT: s_bfe_u32 s14, s3, 0x4000c +; GCN-NEXT: s_bfe_u32 s13, s3, 0x3000d +; GCN-NEXT: s_bfe_u32 s10, s3, 0x2000e +; GCN-NEXT: s_bfe_u32 s9, s3, 0x1000f +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7f +; GCN-NEXT: s_cselect_b32 s4, s4, 1 +; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7e +; GCN-NEXT: s_cselect_b32 s5, s5, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7d +; GCN-NEXT: s_cselect_b32 s5, s7, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7c +; GCN-NEXT: s_cselect_b32 s7, s8, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 3 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 12 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7b +; GCN-NEXT: s_cselect_b32 s5, s11, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7a +; GCN-NEXT: s_cselect_b32 s7, s12, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x79 +; GCN-NEXT: s_cselect_b32 s7, s15, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x78 +; GCN-NEXT: s_cselect_b32 s8, s16, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 15 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x77 +; GCN-NEXT: s_cselect_b32 s5, s19, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x76 +; GCN-NEXT: s_cselect_b32 s7, s20, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x75 +; GCN-NEXT: s_cselect_b32 s7, s23, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x74 +; GCN-NEXT: s_cselect_b32 s8, s24, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 4 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x73 +; GCN-NEXT: s_cselect_b32 s7, s27, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x72 +; GCN-NEXT: s_cselect_b32 s8, s28, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x71 +; GCN-NEXT: s_cselect_b32 s8, s31, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x70 ; GCN-NEXT: s_cselect_b32 s11, s33, 1 ; GCN-NEXT: s_and_b32 s11, s11, 1 -; GCN-NEXT: s_or_b32 s3, s11, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 15 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 0xff -; GCN-NEXT: s_or_b32 s0, s1, s0 -; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6f -; GCN-NEXT: s_cselect_b32 s1, s9, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6e -; GCN-NEXT: s_cselect_b32 s2, s10, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6d -; GCN-NEXT: s_cselect_b32 s2, s13, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6c -; GCN-NEXT: s_cselect_b32 s3, s14, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 12 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6b -; GCN-NEXT: s_cselect_b32 s2, s17, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x6a -; GCN-NEXT: s_cselect_b32 s3, s18, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x69 -; GCN-NEXT: s_cselect_b32 s3, s21, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x68 +; GCN-NEXT: s_or_b32 s8, s11, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 0xff +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6f +; GCN-NEXT: s_cselect_b32 s5, s9, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6e +; GCN-NEXT: s_cselect_b32 s7, s10, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6d +; GCN-NEXT: s_cselect_b32 s7, s13, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6c +; GCN-NEXT: s_cselect_b32 s8, s14, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 12 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6b +; GCN-NEXT: s_cselect_b32 s7, s17, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6a +; GCN-NEXT: s_cselect_b32 s8, s18, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x69 +; GCN-NEXT: s_cselect_b32 s8, s21, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x68 ; GCN-NEXT: s_cselect_b32 s9, s22, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 15 -; GCN-NEXT: s_lshl_b32 s2, s2, 8 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x67 -; GCN-NEXT: s_cselect_b32 s2, s25, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x66 -; GCN-NEXT: s_cselect_b32 s3, s26, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x65 -; GCN-NEXT: s_cselect_b32 s3, s29, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x64 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x67 +; GCN-NEXT: s_cselect_b32 s7, s25, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x66 +; GCN-NEXT: s_cselect_b32 s8, s26, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x65 +; GCN-NEXT: s_cselect_b32 s8, s29, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x64 ; GCN-NEXT: s_cselect_b32 s9, s30, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_lshl_b32 s2, s2, 4 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x63 -; GCN-NEXT: s_cselect_b32 s3, s34, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x62 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshl_b32 s7, s7, 4 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x63 +; GCN-NEXT: s_cselect_b32 s8, s34, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x62 ; GCN-NEXT: s_cselect_b32 s9, s35, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 2 -; GCN-NEXT: s_or_b32 s3, s3, s9 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x60 -; GCN-NEXT: s_cselect_b32 s7, s7, 1 -; GCN-NEXT: s_and_b32 s7, s7, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x61 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x60 +; GCN-NEXT: s_cselect_b32 s3, s3, 1 +; GCN-NEXT: s_and_b32 s3, s3, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x61 ; GCN-NEXT: s_cselect_b32 s9, s36, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s7, s7, s9 -; GCN-NEXT: s_and_b32 s7, s7, 3 -; GCN-NEXT: s_or_b32 s3, s7, s3 +; GCN-NEXT: s_or_b32 s3, s3, s9 +; GCN-NEXT: s_and_b32 s3, s3, 3 +; GCN-NEXT: s_or_b32 s3, s3, s8 ; GCN-NEXT: s_and_b32 s3, s3, 15 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 0xff -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NEXT: s_or_b32 s7, s1, s0 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5f -; GCN-NEXT: s_cselect_b32 s0, s37, 1 -; GCN-NEXT: s_lshl_b32 s0, s0, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5e -; GCN-NEXT: s_cselect_b32 s1, s38, 1 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 2 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5d -; GCN-NEXT: s_cselect_b32 s1, s39, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5c -; GCN-NEXT: s_cselect_b32 s2, vcc_hi, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 3 -; GCN-NEXT: s_or_b32 s0, s1, s0 -; GCN-NEXT: s_lshl_b32 s0, s0, 12 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5b -; GCN-NEXT: s_cselect_b32 s1, s94, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x5a -; GCN-NEXT: s_cselect_b32 s2, s93, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x59 -; GCN-NEXT: s_cselect_b32 s2, s90, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x58 -; GCN-NEXT: s_cselect_b32 s3, s89, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 15 -; GCN-NEXT: s_lshl_b32 s1, s1, 8 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x57 -; GCN-NEXT: s_cselect_b32 s1, s86, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x56 -; GCN-NEXT: s_cselect_b32 s2, s84, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x55 -; GCN-NEXT: s_cselect_b32 s2, s82, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x54 -; GCN-NEXT: s_cselect_b32 s3, s81, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 4 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x53 -; GCN-NEXT: s_cselect_b32 s2, s78, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x52 -; GCN-NEXT: s_cselect_b32 s3, s77, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x51 -; GCN-NEXT: s_cselect_b32 s3, s74, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x50 +; GCN-NEXT: s_or_b32 s3, s3, s7 +; GCN-NEXT: s_and_b32 s3, s3, 0xff +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NEXT: s_or_b32 s3, s3, s4 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5f +; GCN-NEXT: s_cselect_b32 s4, s37, 1 +; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5e +; GCN-NEXT: s_cselect_b32 s5, s38, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5d +; GCN-NEXT: s_cselect_b32 s5, s39, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5c +; GCN-NEXT: s_cselect_b32 s7, vcc_hi, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 3 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 12 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5b +; GCN-NEXT: s_cselect_b32 s5, s94, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5a +; GCN-NEXT: s_cselect_b32 s7, s93, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x59 +; GCN-NEXT: s_cselect_b32 s7, s90, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x58 +; GCN-NEXT: s_cselect_b32 s8, s89, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 15 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x57 +; GCN-NEXT: s_cselect_b32 s5, s86, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x56 +; GCN-NEXT: s_cselect_b32 s7, s84, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x55 +; GCN-NEXT: s_cselect_b32 s7, s82, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x54 +; GCN-NEXT: s_cselect_b32 s8, s81, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 4 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x53 +; GCN-NEXT: s_cselect_b32 s7, s78, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x52 +; GCN-NEXT: s_cselect_b32 s8, s77, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x51 +; GCN-NEXT: s_cselect_b32 s8, s74, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x50 ; GCN-NEXT: s_cselect_b32 s9, s73, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 15 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 0xff -; GCN-NEXT: s_or_b32 s0, s1, s0 -; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4f -; GCN-NEXT: s_cselect_b32 s1, vcc_lo, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4e -; GCN-NEXT: s_cselect_b32 s2, s95, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4d -; GCN-NEXT: s_cselect_b32 s2, s92, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4c -; GCN-NEXT: s_cselect_b32 s3, s91, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 12 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4b -; GCN-NEXT: s_cselect_b32 s2, s88, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x4a -; GCN-NEXT: s_cselect_b32 s3, s87, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x49 -; GCN-NEXT: s_cselect_b32 s3, s85, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x48 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 0xff +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4f +; GCN-NEXT: s_cselect_b32 s5, vcc_lo, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4e +; GCN-NEXT: s_cselect_b32 s7, s95, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4d +; GCN-NEXT: s_cselect_b32 s7, s92, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4c +; GCN-NEXT: s_cselect_b32 s8, s91, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 12 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4b +; GCN-NEXT: s_cselect_b32 s7, s88, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4a +; GCN-NEXT: s_cselect_b32 s8, s87, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x49 +; GCN-NEXT: s_cselect_b32 s8, s85, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x48 ; GCN-NEXT: s_cselect_b32 s9, s83, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 15 -; GCN-NEXT: s_lshl_b32 s2, s2, 8 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x47 -; GCN-NEXT: s_cselect_b32 s2, s80, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x46 -; GCN-NEXT: s_cselect_b32 s3, s79, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x45 -; GCN-NEXT: s_cselect_b32 s3, s76, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x44 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x47 +; GCN-NEXT: s_cselect_b32 s7, s80, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x46 +; GCN-NEXT: s_cselect_b32 s8, s79, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x45 +; GCN-NEXT: s_cselect_b32 s8, s76, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x44 ; GCN-NEXT: s_cselect_b32 s9, s75, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_lshl_b32 s2, s2, 4 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x43 -; GCN-NEXT: s_cselect_b32 s3, s72, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 3 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x42 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshl_b32 s7, s7, 4 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x43 +; GCN-NEXT: s_cselect_b32 s8, s72, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 3 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x42 ; GCN-NEXT: s_cselect_b32 s9, s71, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 2 -; GCN-NEXT: s_or_b32 s3, s3, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 64 -; GCN-NEXT: s_cselect_b32 s6, s6, 1 -; GCN-NEXT: s_and_b32 s6, s6, 1 -; GCN-NEXT: s_cmpk_lg_i32 s8, 0x41 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s6, 64 +; GCN-NEXT: s_cselect_b32 s2, s2, 1 +; GCN-NEXT: s_and_b32 s2, s2, 1 +; GCN-NEXT: s_cmpk_lg_i32 s6, 0x41 ; GCN-NEXT: s_cselect_b32 s9, s70, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s6, s6, s9 -; GCN-NEXT: s_and_b32 s6, s6, 3 -; GCN-NEXT: s_or_b32 s3, s6, s3 -; GCN-NEXT: s_and_b32 s3, s3, 15 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 0xff -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NEXT: s_or_b32 s6, s1, s0 -; GCN-NEXT: s_cmp_lg_u32 s8, 63 -; GCN-NEXT: s_cselect_b32 s0, s69, 1 -; GCN-NEXT: s_lshl_b32 s0, s0, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 62 -; GCN-NEXT: s_cselect_b32 s1, s68, 1 -; GCN-NEXT: s_and_b32 s1, s1, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 2 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s8, 61 -; GCN-NEXT: s_cselect_b32 s1, s67, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 60 -; GCN-NEXT: s_cselect_b32 s2, s66, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 3 -; GCN-NEXT: s_or_b32 s0, s1, s0 -; GCN-NEXT: s_lshl_b32 s0, s0, 12 -; GCN-NEXT: s_cmp_lg_u32 s8, 59 -; GCN-NEXT: s_cselect_b32 s1, s63, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 58 -; GCN-NEXT: s_cselect_b32 s2, s61, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmp_lg_u32 s8, 57 -; GCN-NEXT: s_cselect_b32 s2, s59, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 56 -; GCN-NEXT: s_cselect_b32 s3, s58, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 15 -; GCN-NEXT: s_lshl_b32 s1, s1, 8 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_cmp_lg_u32 s8, 55 -; GCN-NEXT: s_cselect_b32 s1, s55, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 54 -; GCN-NEXT: s_cselect_b32 s2, s53, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmp_lg_u32 s8, 53 -; GCN-NEXT: s_cselect_b32 s2, s51, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 52 -; GCN-NEXT: s_cselect_b32 s3, s50, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_or_b32 s2, s2, s9 ; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 4 -; GCN-NEXT: s_cmp_lg_u32 s8, 51 -; GCN-NEXT: s_cselect_b32 s2, s47, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 50 -; GCN-NEXT: s_cselect_b32 s3, s45, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 49 -; GCN-NEXT: s_cselect_b32 s3, s43, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 48 +; GCN-NEXT: s_or_b32 s2, s2, s8 +; GCN-NEXT: s_and_b32 s2, s2, 15 +; GCN-NEXT: s_or_b32 s2, s2, s7 +; GCN-NEXT: s_and_b32 s2, s2, 0xff +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_cmp_lg_u32 s6, 63 +; GCN-NEXT: s_cselect_b32 s4, s69, 1 +; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 62 +; GCN-NEXT: s_cselect_b32 s5, s68, 1 +; GCN-NEXT: s_and_b32 s5, s5, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s6, 61 +; GCN-NEXT: s_cselect_b32 s5, s67, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 60 +; GCN-NEXT: s_cselect_b32 s7, s66, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 3 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 12 +; GCN-NEXT: s_cmp_lg_u32 s6, 59 +; GCN-NEXT: s_cselect_b32 s5, s63, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 58 +; GCN-NEXT: s_cselect_b32 s7, s61, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 57 +; GCN-NEXT: s_cselect_b32 s7, s59, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 56 +; GCN-NEXT: s_cselect_b32 s8, s58, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 15 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s6, 55 +; GCN-NEXT: s_cselect_b32 s5, s55, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 54 +; GCN-NEXT: s_cselect_b32 s7, s53, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 53 +; GCN-NEXT: s_cselect_b32 s7, s51, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 52 +; GCN-NEXT: s_cselect_b32 s8, s50, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 4 +; GCN-NEXT: s_cmp_lg_u32 s6, 51 +; GCN-NEXT: s_cselect_b32 s7, s47, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 50 +; GCN-NEXT: s_cselect_b32 s8, s45, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmp_lg_u32 s6, 49 +; GCN-NEXT: s_cselect_b32 s8, s43, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 48 ; GCN-NEXT: s_cselect_b32 s9, s42, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 15 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s1, s1, 0xff -; GCN-NEXT: s_or_b32 s0, s1, s0 -; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 47 -; GCN-NEXT: s_cselect_b32 s1, s65, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 46 -; GCN-NEXT: s_cselect_b32 s2, s64, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmp_lg_u32 s8, 45 -; GCN-NEXT: s_cselect_b32 s2, s62, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 44 -; GCN-NEXT: s_cselect_b32 s3, s60, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 12 -; GCN-NEXT: s_cmp_lg_u32 s8, 43 -; GCN-NEXT: s_cselect_b32 s2, s57, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 42 -; GCN-NEXT: s_cselect_b32 s3, s56, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 41 -; GCN-NEXT: s_cselect_b32 s3, s54, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 40 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 0xff +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 47 +; GCN-NEXT: s_cselect_b32 s5, s65, 1 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 46 +; GCN-NEXT: s_cselect_b32 s7, s64, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 45 +; GCN-NEXT: s_cselect_b32 s7, s62, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 44 +; GCN-NEXT: s_cselect_b32 s8, s60, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 12 +; GCN-NEXT: s_cmp_lg_u32 s6, 43 +; GCN-NEXT: s_cselect_b32 s7, s57, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 42 +; GCN-NEXT: s_cselect_b32 s8, s56, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmp_lg_u32 s6, 41 +; GCN-NEXT: s_cselect_b32 s8, s54, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 40 ; GCN-NEXT: s_cselect_b32 s9, s52, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 15 -; GCN-NEXT: s_lshl_b32 s2, s2, 8 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmp_lg_u32 s8, 39 -; GCN-NEXT: s_cselect_b32 s2, s49, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 38 -; GCN-NEXT: s_cselect_b32 s3, s48, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 37 -; GCN-NEXT: s_cselect_b32 s3, s46, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 36 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 39 +; GCN-NEXT: s_cselect_b32 s7, s49, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 38 +; GCN-NEXT: s_cselect_b32 s8, s48, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmp_lg_u32 s6, 37 +; GCN-NEXT: s_cselect_b32 s8, s46, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 36 ; GCN-NEXT: s_cselect_b32 s9, s44, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s3, s9, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_lshl_b32 s2, s2, 4 -; GCN-NEXT: s_cmp_lg_u32 s8, 35 -; GCN-NEXT: s_cselect_b32 s3, s41, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 34 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshl_b32 s7, s7, 4 +; GCN-NEXT: s_cmp_lg_u32 s6, 35 +; GCN-NEXT: s_cselect_b32 s8, s41, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 34 ; GCN-NEXT: s_cselect_b32 s9, s40, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 2 -; GCN-NEXT: s_or_b32 s3, s3, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 32 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_and_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 33 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s6, 32 +; GCN-NEXT: s_cselect_b32 s1, s1, 1 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 33 ; GCN-NEXT: v_readlane_b32 s9, v6, 33 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s5, s5, s9 -; GCN-NEXT: s_and_b32 s5, s5, 3 -; GCN-NEXT: s_or_b32 s3, s5, s3 -; GCN-NEXT: s_and_b32 s3, s3, 15 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 0xff -; GCN-NEXT: s_or_b32 s1, s2, s1 +; GCN-NEXT: s_or_b32 s1, s1, s9 +; GCN-NEXT: s_and_b32 s1, s1, 3 +; GCN-NEXT: s_or_b32 s1, s1, s8 +; GCN-NEXT: s_and_b32 s1, s1, 15 +; GCN-NEXT: s_or_b32 s1, s1, s7 +; GCN-NEXT: s_and_b32 s1, s1, 0xff +; GCN-NEXT: s_or_b32 s1, s1, s5 ; GCN-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NEXT: s_or_b32 s0, s1, s0 -; GCN-NEXT: s_cmp_lg_u32 s8, 31 -; GCN-NEXT: v_readlane_b32 s1, v6, 17 -; GCN-NEXT: s_cselect_b32 s1, s1, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 30 -; GCN-NEXT: v_readlane_b32 s2, v6, 16 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_and_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 2 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmp_lg_u32 s8, 29 -; GCN-NEXT: v_readlane_b32 s2, v6, 15 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 28 -; GCN-NEXT: v_readlane_b32 s3, v6, 14 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 3 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 12 -; GCN-NEXT: s_cmp_lg_u32 s8, 27 -; GCN-NEXT: v_readlane_b32 s2, v6, 13 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 26 -; GCN-NEXT: v_readlane_b32 s3, v6, 12 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 25 -; GCN-NEXT: v_readlane_b32 s3, v6, 11 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 24 -; GCN-NEXT: v_readlane_b32 s5, v6, 10 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_and_b32 s5, s5, 1 -; GCN-NEXT: s_or_b32 s3, s5, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 15 -; GCN-NEXT: s_lshl_b32 s2, s2, 8 -; GCN-NEXT: s_or_b32 s1, s1, s2 -; GCN-NEXT: s_cmp_lg_u32 s8, 23 -; GCN-NEXT: v_readlane_b32 s2, v6, 9 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 22 -; GCN-NEXT: v_readlane_b32 s3, v6, 8 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 21 -; GCN-NEXT: v_readlane_b32 s3, v6, 7 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 20 -; GCN-NEXT: v_readlane_b32 s5, v6, 6 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_and_b32 s5, s5, 1 -; GCN-NEXT: s_or_b32 s3, s5, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_lshl_b32 s2, s2, 4 -; GCN-NEXT: s_cmp_lg_u32 s8, 19 -; GCN-NEXT: v_readlane_b32 s3, v6, 5 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 18 -; GCN-NEXT: v_readlane_b32 s5, v6, 4 +; GCN-NEXT: s_or_b32 s1, s1, s4 +; GCN-NEXT: s_cmp_lg_u32 s6, 31 +; GCN-NEXT: v_readlane_b32 s4, v6, 17 +; GCN-NEXT: s_cselect_b32 s4, s4, 1 +; GCN-NEXT: s_lshl_b32 s4, s4, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 30 +; GCN-NEXT: v_readlane_b32 s5, v6, 16 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_and_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 2 -; GCN-NEXT: s_or_b32 s3, s3, s5 -; GCN-NEXT: s_cmp_lg_u32 s8, 17 -; GCN-NEXT: v_readlane_b32 s5, v6, 3 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s6, 29 +; GCN-NEXT: v_readlane_b32 s5, v6, 15 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 ; GCN-NEXT: s_lshl_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 16 -; GCN-NEXT: v_readlane_b32 s9, v6, 2 -; GCN-NEXT: s_cselect_b32 s9, s9, 1 -; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s5, s9, s5 +; GCN-NEXT: s_cmp_lg_u32 s6, 28 +; GCN-NEXT: v_readlane_b32 s7, v6, 14 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_or_b32 s5, s7, s5 ; GCN-NEXT: s_and_b32 s5, s5, 3 -; GCN-NEXT: s_or_b32 s3, s5, s3 -; GCN-NEXT: s_and_b32 s3, s3, 15 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 0xff -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s1, s1, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 15 -; GCN-NEXT: v_readlane_b32 s2, v6, 32 -; GCN-NEXT: s_cselect_b32 s2, s2, 1 -; GCN-NEXT: s_lshl_b32 s2, s2, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 14 -; GCN-NEXT: v_readlane_b32 s3, v6, 31 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_and_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 2 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 13 -; GCN-NEXT: v_readlane_b32 s3, v6, 30 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 12 -; GCN-NEXT: v_readlane_b32 s5, v6, 29 +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 12 +; GCN-NEXT: s_cmp_lg_u32 s6, 27 +; GCN-NEXT: v_readlane_b32 s5, v6, 13 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_and_b32 s5, s5, 1 -; GCN-NEXT: s_or_b32 s3, s5, s3 -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_lshl_b32 s2, s2, 12 -; GCN-NEXT: s_cmp_lg_u32 s8, 11 -; GCN-NEXT: v_readlane_b32 s3, v6, 28 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 10 -; GCN-NEXT: v_readlane_b32 s5, v6, 27 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 26 +; GCN-NEXT: v_readlane_b32 s7, v6, 12 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 25 +; GCN-NEXT: v_readlane_b32 s7, v6, 11 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 24 +; GCN-NEXT: v_readlane_b32 s8, v6, 10 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 15 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s6, 23 +; GCN-NEXT: v_readlane_b32 s5, v6, 9 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_and_b32 s5, s5, 1 -; GCN-NEXT: s_lshl_b32 s5, s5, 2 -; GCN-NEXT: s_or_b32 s3, s3, s5 -; GCN-NEXT: s_cmp_lg_u32 s8, 9 -; GCN-NEXT: v_readlane_b32 s5, v6, 26 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 22 +; GCN-NEXT: v_readlane_b32 s7, v6, 8 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 21 +; GCN-NEXT: v_readlane_b32 s7, v6, 7 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 20 +; GCN-NEXT: v_readlane_b32 s8, v6, 6 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 4 +; GCN-NEXT: s_cmp_lg_u32 s6, 19 +; GCN-NEXT: v_readlane_b32 s7, v6, 5 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 18 +; GCN-NEXT: v_readlane_b32 s8, v6, 4 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmp_lg_u32 s6, 17 +; GCN-NEXT: v_readlane_b32 s8, v6, 3 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 16 +; GCN-NEXT: v_readlane_b32 s9, v6, 2 +; GCN-NEXT: s_cselect_b32 s9, s9, 1 +; GCN-NEXT: s_and_b32 s9, s9, 1 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_and_b32 s5, s5, 0xff +; GCN-NEXT: s_or_b32 s4, s5, s4 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_cmp_lg_u32 s6, 15 +; GCN-NEXT: v_readlane_b32 s5, v6, 32 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_lshl_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 8 +; GCN-NEXT: s_lshl_b32 s5, s5, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 14 +; GCN-NEXT: v_readlane_b32 s7, v6, 31 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_and_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 13 +; GCN-NEXT: v_readlane_b32 s7, v6, 30 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 12 +; GCN-NEXT: v_readlane_b32 s8, v6, 29 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 3 +; GCN-NEXT: s_or_b32 s5, s7, s5 +; GCN-NEXT: s_lshl_b32 s5, s5, 12 +; GCN-NEXT: s_cmp_lg_u32 s6, 11 +; GCN-NEXT: v_readlane_b32 s7, v6, 28 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 10 +; GCN-NEXT: v_readlane_b32 s8, v6, 27 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmp_lg_u32 s6, 9 +; GCN-NEXT: v_readlane_b32 s8, v6, 26 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 8 ; GCN-NEXT: v_readlane_b32 s9, v6, 25 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s5, s9, s5 -; GCN-NEXT: s_and_b32 s5, s5, 3 -; GCN-NEXT: s_or_b32 s3, s5, s3 -; GCN-NEXT: s_and_b32 s3, s3, 15 -; GCN-NEXT: s_lshl_b32 s3, s3, 8 -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_readlane_b32 s3, v6, 24 -; GCN-NEXT: s_cselect_b32 s3, s3, 1 -; GCN-NEXT: s_lshl_b32 s3, s3, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 -; GCN-NEXT: v_readlane_b32 s5, v6, 23 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_and_b32 s5, s5, 1 -; GCN-NEXT: s_lshl_b32 s5, s5, 2 -; GCN-NEXT: s_or_b32 s3, s3, s5 -; GCN-NEXT: s_cmp_lg_u32 s8, 5 -; GCN-NEXT: v_readlane_b32 s5, v6, 22 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_lshl_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_and_b32 s7, s7, 15 +; GCN-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NEXT: s_or_b32 s5, s5, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 7 +; GCN-NEXT: v_readlane_b32 s7, v6, 24 +; GCN-NEXT: s_cselect_b32 s7, s7, 1 +; GCN-NEXT: s_lshl_b32 s7, s7, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 6 +; GCN-NEXT: v_readlane_b32 s8, v6, 23 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_and_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 2 +; GCN-NEXT: s_or_b32 s7, s7, s8 +; GCN-NEXT: s_cmp_lg_u32 s6, 5 +; GCN-NEXT: v_readlane_b32 s8, v6, 22 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 4 ; GCN-NEXT: v_readlane_b32 s9, v6, 21 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 -; GCN-NEXT: s_or_b32 s5, s9, s5 -; GCN-NEXT: s_and_b32 s5, s5, 3 -; GCN-NEXT: s_or_b32 s3, s5, s3 -; GCN-NEXT: s_lshl_b32 s3, s3, 4 -; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: v_readlane_b32 s5, v6, 20 -; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_lshl_b32 s5, s5, 3 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_or_b32 s8, s9, s8 +; GCN-NEXT: s_and_b32 s8, s8, 3 +; GCN-NEXT: s_or_b32 s7, s8, s7 +; GCN-NEXT: s_lshl_b32 s7, s7, 4 +; GCN-NEXT: s_cmp_lg_u32 s6, 3 +; GCN-NEXT: v_readlane_b32 s8, v6, 20 +; GCN-NEXT: s_cselect_b32 s8, s8, 1 +; GCN-NEXT: s_lshl_b32 s8, s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s6, 2 ; GCN-NEXT: v_readlane_b32 s9, v6, 19 ; GCN-NEXT: s_cselect_b32 s9, s9, 1 ; GCN-NEXT: s_and_b32 s9, s9, 1 ; GCN-NEXT: s_lshl_b32 s9, s9, 2 -; GCN-NEXT: s_or_b32 s5, s5, s9 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, 1 -; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 -; GCN-NEXT: v_readlane_b32 s8, v6, 18 -; GCN-NEXT: s_cselect_b32 s8, s8, 1 -; GCN-NEXT: s_lshl_b32 s8, s8, 1 -; GCN-NEXT: s_or_b32 s4, s4, s8 -; GCN-NEXT: s_and_b32 s4, s4, 3 -; GCN-NEXT: s_or_b32 s4, s4, s5 -; GCN-NEXT: s_and_b32 s4, s4, 15 -; GCN-NEXT: s_or_b32 s3, s4, s3 -; GCN-NEXT: s_and_b32 s3, s3, 0xff -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, 1 +; GCN-NEXT: s_and_b32 s0, s0, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 1 +; GCN-NEXT: v_readlane_b32 s6, v6, 18 +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_lshl_b32 s6, s6, 1 +; GCN-NEXT: s_or_b32 s0, s0, s6 +; GCN-NEXT: s_and_b32 s0, s0, 3 +; GCN-NEXT: s_or_b32 s0, s0, s8 +; GCN-NEXT: s_and_b32 s0, s0, 15 +; GCN-NEXT: s_or_b32 s0, s0, s7 +; GCN-NEXT: s_and_b32 s0, s0, 0xff +; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_readlane_b32 s0, v6, 0 ; GCN-NEXT: v_readlane_b32 s1, v6, 1 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 213813a94fc859..3aa0437f0466e3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -40,7 +40,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -53,7 +53,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -71,7 +71,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -84,7 +84,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -102,7 +102,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -115,7 +115,7 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -135,8 +135,8 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -150,8 +150,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -170,8 +170,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s1, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -185,8 +185,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s1, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -205,8 +205,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s2, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -220,8 +220,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s2, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -240,8 +240,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -255,8 +255,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -275,8 +275,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind { ; SI-LABEL: insertelement_v4i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_movk_i32 s0, 0x3e7 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -290,8 +290,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: insertelement_v4i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_movk_i32 s0, 0x3e7 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -310,8 +310,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -323,8 +323,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -341,8 +341,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 @@ -354,8 +354,8 @@ define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 @@ -497,8 +497,8 @@ define <12 x float> @insertelement_to_v12f32_undef() nounwind { define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -516,8 +516,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -540,20 +540,20 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 +; SI-NEXT: s_load_dword s10, s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 2 +; SI-NEXT: s_cmp_lg_u32 s10, 2 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: s_cmp_lg_u32 s8, 1 +; SI-NEXT: s_cmp_lg_u32 s10, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s10, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -564,20 +564,20 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 +; VI-NEXT: s_load_dword s10, s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s8, 2 +; VI-NEXT: s_cmp_lg_u32 s10, 2 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cmp_lg_u32 s10, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s10, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -593,24 +593,24 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 +; SI-NEXT: s_load_dword s10, s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 3 +; SI-NEXT: s_cmp_lg_u32 s10, 3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_cmp_lg_u32 s8, 2 +; SI-NEXT: s_cmp_lg_u32 s10, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s8, 1 +; SI-NEXT: s_cmp_lg_u32 s10, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s10, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -621,24 +621,24 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 +; VI-NEXT: s_load_dword s10, s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s8, 3 +; VI-NEXT: s_cmp_lg_u32 s10, 3 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_cmp_lg_u32 s8, 2 +; VI-NEXT: s_cmp_lg_u32 s10, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cmp_lg_u32 s10, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s10, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -654,44 +654,44 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 +; SI-NEXT: s_load_dword s8, s[8:9], 0x10 ; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s15, 0x100f000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: s_mov_b32 m0, s8 ; SI-NEXT: v_movreld_b32_e32 v0, v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x40 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 ; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, v8 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -705,21 +705,21 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x18 -; SI-NEXT: s_load_dword s5, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dword s4, s[8:9], 0x18 +; SI-NEXT: s_load_dword s5, s[8:9], 0x20 ; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 ; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: s_mov_b32 m0, s5 ; SI-NEXT: s_mov_b32 s2, -1 @@ -731,20 +731,20 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x60 -; VI-NEXT: s_load_dword s5, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 +; VI-NEXT: s_load_dword s4, s[8:9], 0x60 +; VI-NEXT: s_load_dword s5, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: s_mov_b32 m0, s5 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 @@ -762,21 +762,21 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 -; SI-NEXT: s_load_dword s6, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; SI-NEXT: s_load_dword s6, s[8:9], 0x20 ; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 ; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: s_mov_b32 m0, s6 @@ -789,20 +789,20 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 -; VI-NEXT: s_load_dword s6, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x60 +; VI-NEXT: s_load_dword s6, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_mov_b32 m0, s6 @@ -821,25 +821,26 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v11f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s7, s[8:9], 0x20 ; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 m0, s7 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v11 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -849,27 +850,27 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s7, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: v_mov_b32_e32 v10, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, s7 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mov_b32_e32 v9, s9 -; VI-NEXT: v_mov_b32_e32 v10, s10 -; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, v11 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -883,26 +884,26 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18 +; SI-NEXT: s_load_dword s8, s[8:9], 0x20 ; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: s_mov_b32 m0, s8 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v12 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -912,28 +913,27 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60 +; VI-NEXT: s_load_dword s8, s[8:9], 0x80 ; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: v_mov_b32_e32 v10, s6 +; VI-NEXT: v_mov_b32_e32 v11, s7 +; VI-NEXT: s_mov_b32 m0, s8 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mov_b32_e32 v9, s9 -; VI-NEXT: v_mov_b32_e32 v10, s10 -; VI-NEXT: v_mov_b32_e32 v11, s11 -; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, v12 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -947,29 +947,29 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10 +; SI-NEXT: s_load_dword s4, s[8:9], 0x20 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, v16 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 @@ -980,29 +980,29 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; VI-NEXT: s_load_dword s4, s[8:9], 0x80 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s16 -; VI-NEXT: v_mov_b32_e32 v9, s17 -; VI-NEXT: v_mov_b32_e32 v10, s18 -; VI-NEXT: v_mov_b32_e32 v11, s19 -; VI-NEXT: v_mov_b32_e32 v12, s20 -; VI-NEXT: v_mov_b32_e32 v13, s21 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 ; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, v16 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 @@ -1018,8 +1018,8 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1034,8 +1034,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1055,17 +1055,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s10, s[8:9], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 2 +; SI-NEXT: s_cmp_lg_u32 s10, 2 ; SI-NEXT: s_cselect_b32 s2, s2, 5 -; SI-NEXT: s_cmp_lg_u32 s8, 1 +; SI-NEXT: s_cmp_lg_u32 s10, 1 ; SI-NEXT: s_cselect_b32 s1, s1, 5 -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s10, 0 ; SI-NEXT: s_cselect_b32 s0, s0, 5 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 @@ -1075,17 +1075,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s10, s[8:9], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s8, 2 +; VI-NEXT: s_cmp_lg_u32 s10, 2 ; VI-NEXT: s_cselect_b32 s2, s2, 5 -; VI-NEXT: s_cmp_lg_u32 s8, 1 +; VI-NEXT: s_cmp_lg_u32 s10, 1 ; VI-NEXT: s_cselect_b32 s1, s1, 5 -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s10, 0 ; VI-NEXT: s_cselect_b32 s0, s0, 5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1100,21 +1100,21 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dword s9, s[6:7], 0x11 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dword s10, s[8:9], 0x8 +; SI-NEXT: s_load_dword s11, s[8:9], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: s_cselect_b32 s3, s9, s3 -; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: s_cselect_b32 s2, s9, s2 -; SI-NEXT: s_cmp_eq_u32 s8, 1 -; SI-NEXT: s_cselect_b32 s1, s9, s1 -; SI-NEXT: s_cmp_eq_u32 s8, 0 -; SI-NEXT: s_cselect_b32 s0, s9, s0 +; SI-NEXT: s_cmp_eq_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s3, s11, s3 +; SI-NEXT: s_cmp_eq_u32 s10, 2 +; SI-NEXT: s_cselect_b32 s2, s11, s2 +; SI-NEXT: s_cmp_eq_u32 s10, 1 +; SI-NEXT: s_cselect_b32 s1, s11, s1 +; SI-NEXT: s_cmp_eq_u32 s10, 0 +; SI-NEXT: s_cselect_b32 s0, s11, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_mov_b32_e32 v2, s2 @@ -1124,21 +1124,21 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dword s9, s[6:7], 0x44 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dword s10, s[8:9], 0x20 +; VI-NEXT: s_load_dword s11, s[8:9], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s8, 3 -; VI-NEXT: s_cselect_b32 s3, s9, s3 -; VI-NEXT: s_cmp_eq_u32 s8, 2 -; VI-NEXT: s_cselect_b32 s2, s9, s2 -; VI-NEXT: s_cmp_eq_u32 s8, 1 -; VI-NEXT: s_cselect_b32 s1, s9, s1 -; VI-NEXT: s_cmp_eq_u32 s8, 0 -; VI-NEXT: s_cselect_b32 s0, s9, s0 +; VI-NEXT: s_cmp_eq_u32 s10, 3 +; VI-NEXT: s_cselect_b32 s3, s11, s3 +; VI-NEXT: s_cmp_eq_u32 s10, 2 +; VI-NEXT: s_cselect_b32 s2, s11, s2 +; VI-NEXT: s_cmp_eq_u32 s10, 1 +; VI-NEXT: s_cselect_b32 s1, s11, s1 +; VI-NEXT: s_cmp_eq_u32 s10, 0 +; VI-NEXT: s_cselect_b32 s0, s11, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -1153,42 +1153,42 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 +; SI-NEXT: s_load_dword s8, s[8:9], 0x10 +; SI-NEXT: s_mov_b32 s15, 0x100f000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: s_mov_b32 m0, s8 ; SI-NEXT: v_movreld_b32_e32 v0, 5 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 ; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1202,21 +1202,21 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x18 -; SI-NEXT: s_load_dword s5, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dword s4, s[8:9], 0x18 +; SI-NEXT: s_load_dword s5, s[8:9], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 ; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: s_mov_b32 m0, s5 ; SI-NEXT: v_movreld_b32_e32 v0, 5 @@ -1227,21 +1227,21 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x60 -; VI-NEXT: s_load_dword s5, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 +; VI-NEXT: s_load_dword s4, s[8:9], 0x60 +; VI-NEXT: s_load_dword s5, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: s_mov_b32 m0, s5 ; VI-NEXT: v_movreld_b32_e32 v0, 5 @@ -1257,21 +1257,21 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 -; SI-NEXT: s_load_dword s6, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; SI-NEXT: s_load_dword s6, s[8:9], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 ; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: s_mov_b32 m0, s6 @@ -1283,20 +1283,20 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 -; VI-NEXT: s_load_dword s6, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x60 +; VI-NEXT: s_load_dword s6, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_mov_b32 m0, s6 @@ -1314,25 +1314,26 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v11i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s7, s[8:9], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 m0, s7 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1341,26 +1342,26 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: s_load_dword s7, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: v_mov_b32_e32 v10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mov_b32_e32 v9, s9 -; VI-NEXT: v_mov_b32_e32 v10, s10 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s7 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1374,26 +1375,26 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18 +; SI-NEXT: s_load_dword s8, s[8:9], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: s_mov_b32 m0, s8 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1402,27 +1403,26 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60 +; VI-NEXT: s_load_dword s8, s[8:9], 0x80 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mov_b32_e32 v9, s9 -; VI-NEXT: v_mov_b32_e32 v10, s10 -; VI-NEXT: v_mov_b32_e32 v11, s11 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: v_mov_b32_e32 v10, s6 +; VI-NEXT: v_mov_b32_e32 v11, s7 +; VI-NEXT: s_mov_b32 m0, s8 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1436,28 +1436,28 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10 +; SI-NEXT: s_load_dword s4, s[8:9], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 @@ -1468,28 +1468,28 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; VI-NEXT: s_load_dword s4, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s16 -; VI-NEXT: v_mov_b32_e32 v9, s17 -; VI-NEXT: v_mov_b32_e32 v10, s18 -; VI-NEXT: v_mov_b32_e32 v11, s19 -; VI-NEXT: v_mov_b32_e32 v12, s20 -; VI-NEXT: v_mov_b32_e32 v13, s21 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 ; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 @@ -1505,7 +1505,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s8, s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1565,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s8, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s8, s[8:9], 0x10 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1592,9 +1592,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s4, s[8:9], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1609,9 +1609,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v2i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s4, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,9 +1634,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s4, s[8:9], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1654,9 +1654,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v3i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s4, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1679,9 +1679,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s4, s[8:9], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,9 +1696,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s4, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1718,46 +1718,46 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) nounwind { ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_lshl_b32 s0, s4, 3 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b32 s0, s8, 3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; SI-NEXT: s_and_b32 s5, s1, 0x5050505 +; SI-NEXT: s_and_b32 s9, s1, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; SI-NEXT: s_and_b32 s4, s0, 0x5050505 -; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] +; SI-NEXT: s_and_b32 s8, s0, 0x5050505 +; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_dynamic_insertelement_v8i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_mov_b32 s11, 0x1100f000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s8, s[8:9], 0x10 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_lshl_b32 s0, s4, 3 -; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s8, 3 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; VI-NEXT: s_and_b32 s5, s1, 0x5050505 +; VI-NEXT: s_and_b32 s9, s1, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; VI-NEXT: s_and_b32 s4, s0, 0x5050505 -; VI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] +; VI-NEXT: s_and_b32 s8, s0, 0x5050505 +; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b @@ -1768,191 +1768,191 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x4 -; SI-NEXT: s_load_dword s4, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4 +; SI-NEXT: s_load_dword s10, s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s11, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 15 -; SI-NEXT: s_cselect_b32 s5, s5, 5 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshr_b32 s6, s11, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 14 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_lshr_b32 s6, s11, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 13 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 12 -; SI-NEXT: s_cselect_b32 s7, s11, 5 -; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_lshr_b32 s6, s10, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 11 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: s_lshr_b32 s7, s10, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 10 +; SI-NEXT: s_lshr_b32 s8, s7, 24 +; SI-NEXT: s_cmp_lg_u32 s10, 15 +; SI-NEXT: s_cselect_b32 s8, s8, 5 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_cmp_lg_u32 s10, 14 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 13 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 12 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_lshr_b32 s7, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 9 -; SI-NEXT: s_cselect_b32 s7, s7, 5 -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 8 -; SI-NEXT: s_cselect_b32 s10, s10, 5 -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s7, s7, s9 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_lshr_b32 s7, s9, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 7 -; SI-NEXT: s_cselect_b32 s7, s7, 5 -; SI-NEXT: s_lshl_b32 s7, s7, 24 -; SI-NEXT: s_lshr_b32 s10, s9, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 6 -; SI-NEXT: s_cselect_b32 s10, s10, 5 -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_or_b32 s7, s7, s10 -; SI-NEXT: s_lshr_b32 s10, s9, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 5 -; SI-NEXT: s_cselect_b32 s10, s10, 5 -; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_cmp_lg_u32 s10, 11 +; SI-NEXT: s_cselect_b32 s8, s8, 5 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_cmp_lg_u32 s10, 10 ; SI-NEXT: s_cselect_b32 s9, s9, 5 ; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_lshr_b32 s9, s8, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s9, s6, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 9 ; SI-NEXT: s_cselect_b32 s9, s9, 5 -; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 2 -; SI-NEXT: s_cselect_b32 s10, s10, 5 -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_lshr_b32 s10, s8, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 1 -; SI-NEXT: s_cselect_b32 s10, s10, 5 -; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cselect_b32 s4, s8, 5 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 8 +; SI-NEXT: s_cselect_b32 s6, s6, 5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_lshr_b32 s8, s5, 24 +; SI-NEXT: s_cmp_lg_u32 s10, 7 +; SI-NEXT: s_cselect_b32 s8, s8, 5 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_cmp_lg_u32 s10, 6 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s9, s5, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 5 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 4 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_lshr_b32 s8, s4, 24 +; SI-NEXT: s_cmp_lg_u32 s10, 3 +; SI-NEXT: s_cselect_b32 s8, s8, 5 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshr_b32 s9, s4, 16 +; SI-NEXT: s_cmp_lg_u32 s10, 2 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshr_b32 s9, s4, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 1 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_cmp_lg_u32 s10, 0 +; SI-NEXT: s_cselect_b32 s4, s4, 5 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_or_b32 s4, s4, s10 -; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s4, s9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x10 -; VI-NEXT: s_load_dword s4, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10 +; VI-NEXT: s_load_dword s10, s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s11, 24 -; VI-NEXT: s_cmp_lg_u32 s4, 15 -; VI-NEXT: s_cselect_b32 s5, s5, 5 -; VI-NEXT: s_lshl_b32 s5, s5, 8 -; VI-NEXT: s_lshr_b32 s6, s11, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 14 -; VI-NEXT: s_cselect_b32 s6, s6, 5 -; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s6, s11, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 13 -; VI-NEXT: s_cselect_b32 s6, s6, 5 -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 12 -; VI-NEXT: s_cselect_b32 s7, s11, 5 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_lshr_b32 s6, s10, 24 -; VI-NEXT: s_cmp_lg_u32 s4, 11 -; VI-NEXT: s_cselect_b32 s6, s6, 5 -; VI-NEXT: s_lshl_b32 s6, s6, 8 -; VI-NEXT: s_lshr_b32 s7, s10, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 10 +; VI-NEXT: s_lshr_b32 s8, s7, 24 +; VI-NEXT: s_cmp_lg_u32 s10, 15 +; VI-NEXT: s_cselect_b32 s8, s8, 5 +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_lshr_b32 s9, s7, 16 +; VI-NEXT: s_cmp_lg_u32 s10, 14 +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_lshr_b32 s9, s7, 8 +; VI-NEXT: s_cmp_lg_u32 s10, 13 +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_cmp_lg_u32 s10, 12 ; VI-NEXT: s_cselect_b32 s7, s7, 5 ; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_lshr_b32 s7, s10, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 9 -; VI-NEXT: s_cselect_b32 s7, s7, 5 -; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 8 -; VI-NEXT: s_cselect_b32 s10, s10, 5 -; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_or_b32 s7, s10, s7 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_lshr_b32 s7, s9, 24 -; VI-NEXT: s_cmp_lg_u32 s4, 7 -; VI-NEXT: s_cselect_b32 s7, s7, 5 -; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: s_lshr_b32 s10, s9, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 6 -; VI-NEXT: s_cselect_b32 s10, s10, 5 -; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_or_b32 s7, s10, s7 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_lshr_b32 s10, s9, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 5 -; VI-NEXT: s_cselect_b32 s10, s10, 5 -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 4 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_lshr_b32 s8, s6, 24 +; VI-NEXT: s_cmp_lg_u32 s10, 11 +; VI-NEXT: s_cselect_b32 s8, s8, 5 +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_lshr_b32 s9, s6, 16 +; VI-NEXT: s_cmp_lg_u32 s10, 10 ; VI-NEXT: s_cselect_b32 s9, s9, 5 ; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_or_b32 s9, s9, s10 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_or_b32 s7, s9, s7 -; VI-NEXT: s_lshr_b32 s9, s8, 24 -; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_lshr_b32 s9, s6, 8 +; VI-NEXT: s_cmp_lg_u32 s10, 9 ; VI-NEXT: s_cselect_b32 s9, s9, 5 ; VI-NEXT: s_lshl_b32 s9, s9, 8 -; VI-NEXT: s_lshr_b32 s10, s8, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 2 -; VI-NEXT: s_cselect_b32 s10, s10, 5 -; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s10, s8, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s10, s10, 5 -; VI-NEXT: s_lshl_b32 s10, s10, 8 -; VI-NEXT: s_cmp_lg_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s4, s8, 5 +; VI-NEXT: s_cmp_lg_u32 s10, 8 +; VI-NEXT: s_cselect_b32 s6, s6, 5 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_or_b32 s6, s6, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: s_lshr_b32 s8, s5, 24 +; VI-NEXT: s_cmp_lg_u32 s10, 7 +; VI-NEXT: s_cselect_b32 s8, s8, 5 +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_lshr_b32 s9, s5, 16 +; VI-NEXT: s_cmp_lg_u32 s10, 6 +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_lshr_b32 s9, s5, 8 +; VI-NEXT: s_cmp_lg_u32 s10, 5 +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_cmp_lg_u32 s10, 4 +; VI-NEXT: s_cselect_b32 s5, s5, 5 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_or_b32 s5, s5, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_lshr_b32 s8, s4, 24 +; VI-NEXT: s_cmp_lg_u32 s10, 3 +; VI-NEXT: s_cselect_b32 s8, s8, 5 +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_lshr_b32 s9, s4, 16 +; VI-NEXT: s_cmp_lg_u32 s10, 2 +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_lshr_b32 s9, s4, 8 +; VI-NEXT: s_cmp_lg_u32 s10, 1 +; VI-NEXT: s_cselect_b32 s9, s9, 5 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_cmp_lg_u32 s10, 0 +; VI-NEXT: s_cselect_b32 s4, s4, 5 ; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_or_b32 s4, s4, s10 -; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <16 x i8> %a, i8 5, i32 %b @@ -1965,8 +1965,8 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1 define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: insert_split_bb: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[8:9], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB42_4 @@ -1992,8 +1992,8 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: s_cbranch_scc0 .LBB42_4 @@ -2038,16 +2038,16 @@ endif: define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x18 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s10, s[8:9], 0x18 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0xc +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_cmp_eq_u32 s10, 1 ; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3 ; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cmp_eq_u32 s8, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 0 ; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2059,16 +2059,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x60 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s10, s[8:9], 0x60 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x30 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s8, 1 +; VI-NEXT: s_cmp_eq_u32 s10, 1 ; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3 ; VI-NEXT: s_cselect_b32 s2, 0, s2 -; VI-NEXT: s_cmp_eq_u32 s8, 0 +; VI-NEXT: s_cmp_eq_u32 s10, 0 ; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1 ; VI-NEXT: s_cselect_b32 s0, 0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2085,16 +2085,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s10, s[8:9], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_cmp_eq_u32 s10, 1 ; SI-NEXT: s_cselect_b32 s3, 0, s3 ; SI-NEXT: s_cselect_b32 s2, 5, s2 -; SI-NEXT: s_cmp_eq_u32 s8, 0 +; SI-NEXT: s_cmp_eq_u32 s10, 0 ; SI-NEXT: s_cselect_b32 s1, 0, s1 ; SI-NEXT: s_cselect_b32 s0, 5, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2106,16 +2106,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s10, s[8:9], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s8, 1 +; VI-NEXT: s_cmp_eq_u32 s10, 1 ; VI-NEXT: s_cselect_b32 s3, 0, s3 ; VI-NEXT: s_cselect_b32 s2, 5, s2 -; VI-NEXT: s_cmp_eq_u32 s8, 0 +; VI-NEXT: s_cmp_eq_u32 s10, 0 ; VI-NEXT: s_cselect_b32 s1, 0, s1 ; VI-NEXT: s_cselect_b32 s0, 5, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2132,57 +2132,57 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0xc +; SI-NEXT: s_load_dword s10, s[8:9], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0xc ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s12, 1 +; SI-NEXT: s_cmp_eq_u32 s10, 1 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_cselect_b32 s6, 0, s11 -; SI-NEXT: s_cselect_b32 s7, 5, s10 -; SI-NEXT: s_cmp_eq_u32 s12, 0 -; SI-NEXT: s_cselect_b32 s9, 0, s9 -; SI-NEXT: s_cselect_b32 s8, 5, s8 -; SI-NEXT: s_cmp_eq_u32 s12, 2 +; SI-NEXT: s_cselect_b32 s7, 0, s7 +; SI-NEXT: s_cselect_b32 s6, 5, s6 +; SI-NEXT: s_cmp_eq_u32 s10, 0 ; SI-NEXT: s_cselect_b32 s5, 0, s5 ; SI-NEXT: s_cselect_b32 s4, 5, s4 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; SI-NEXT: s_cmp_eq_u32 s10, 2 +; SI-NEXT: s_cselect_b32 s9, 0, s9 +; SI-NEXT: s_cselect_b32 s8, 5, s8 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s12, s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x30 +; VI-NEXT: s_load_dword s10, s[8:9], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x30 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s12, 1 +; VI-NEXT: s_cmp_eq_u32 s10, 1 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_cselect_b32 s6, 0, s11 -; VI-NEXT: s_cselect_b32 s7, 5, s10 -; VI-NEXT: s_cmp_eq_u32 s12, 0 -; VI-NEXT: s_cselect_b32 s9, 0, s9 -; VI-NEXT: s_cselect_b32 s8, 5, s8 -; VI-NEXT: s_cmp_eq_u32 s12, 2 +; VI-NEXT: s_cselect_b32 s7, 0, s7 +; VI-NEXT: s_cselect_b32 s6, 5, s6 +; VI-NEXT: s_cmp_eq_u32 s10, 0 ; VI-NEXT: s_cselect_b32 s5, 0, s5 ; VI-NEXT: s_cselect_b32 s4, 5, s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; VI-NEXT: s_cmp_eq_u32 s10, 2 +; VI-NEXT: s_cselect_b32 s9, 0, s9 +; VI-NEXT: s_cselect_b32 s8, 5, s8 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i64> %a, i64 5, i32 %b @@ -2193,68 +2193,68 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s12, s[8:9], 0x10 +; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s4, 1 -; SI-NEXT: s_cselect_b32 s5, 0x40200000, s11 -; SI-NEXT: s_cselect_b32 s6, 0, s10 -; SI-NEXT: s_cmp_eq_u32 s4, 0 -; SI-NEXT: s_cselect_b32 s7, 0x40200000, s9 -; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cmp_eq_u32 s4, 3 -; SI-NEXT: s_cselect_b32 s9, 0x40200000, s15 -; SI-NEXT: s_cselect_b32 s10, 0, s14 -; SI-NEXT: s_cmp_eq_u32 s4, 2 -; SI-NEXT: s_cselect_b32 s4, 0x40200000, s13 -; SI-NEXT: s_cselect_b32 s11, 0, s12 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_cmp_eq_u32 s12, 1 +; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cmp_eq_u32 s12, 0 +; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cmp_eq_u32 s12, 3 +; SI-NEXT: s_cselect_b32 s7, 0x40200000, s7 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cmp_eq_u32 s12, 2 +; SI-NEXT: s_cselect_b32 s5, 0x40200000, s5 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dword s12, s[8:9], 0x40 +; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s5, 0x40200000, s11 -; VI-NEXT: s_cselect_b32 s6, 0, s10 -; VI-NEXT: s_cmp_eq_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s7, 0x40200000, s9 -; VI-NEXT: s_cselect_b32 s8, 0, s8 -; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: s_cselect_b32 s9, 0x40200000, s15 -; VI-NEXT: s_cselect_b32 s10, 0, s14 -; VI-NEXT: s_cmp_eq_u32 s4, 2 -; VI-NEXT: s_cselect_b32 s4, 0x40200000, s13 -; VI-NEXT: s_cselect_b32 s11, 0, s12 -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_cmp_eq_u32 s12, 1 +; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3 +; VI-NEXT: s_cselect_b32 s2, 0, s2 +; VI-NEXT: s_cmp_eq_u32 s12, 0 +; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1 +; VI-NEXT: s_cselect_b32 s0, 0, s0 +; VI-NEXT: s_cmp_eq_u32 s12, 3 +; VI-NEXT: s_cselect_b32 s7, 0x40200000, s7 +; VI-NEXT: s_cselect_b32 s6, 0, s6 +; VI-NEXT: s_cmp_eq_u32 s12, 2 +; VI-NEXT: s_cselect_b32 s5, 0x40200000, s5 +; VI-NEXT: s_cselect_b32 s4, 0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x double> %a, double 8.0, i32 %b store <4 x double> %vecins, ptr addrspace(1) %out, align 16 @@ -2264,29 +2264,29 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s4, s[8:9], 0x20 +; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s4, 1 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2299,29 +2299,29 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x80 +; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s4, 1 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s16 -; VI-NEXT: v_mov_b32_e32 v9, s17 -; VI-NEXT: v_mov_b32_e32 v10, s18 -; VI-NEXT: v_mov_b32_e32 v11, s19 -; VI-NEXT: v_mov_b32_e32 v12, s20 -; VI-NEXT: v_mov_b32_e32 v13, s21 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: v_mov_b32_e32 v5, s17 +; VI-NEXT: v_mov_b32_e32 v6, s18 +; VI-NEXT: v_mov_b32_e32 v7, s19 +; VI-NEXT: v_mov_b32_e32 v8, s20 +; VI-NEXT: v_mov_b32_e32 v9, s21 +; VI-NEXT: v_mov_b32_e32 v10, s22 +; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v15, s27 ; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 0 ; VI-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index c68138acc9b2bf..48a168b4bfbe71 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: s_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -81,7 +81,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -95,7 +95,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: s_insertelement_v2bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -107,7 +107,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -160,7 +160,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x40a0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -173,7 +173,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x40a0 @@ -197,7 +197,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -214,7 +214,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_0_inlineimm: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dword v1, v0, s[2:3] @@ -244,7 +244,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_0_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -267,7 +267,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -302,7 +302,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v2bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -315,7 +315,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 @@ -339,7 +339,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -356,7 +356,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -374,7 +374,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -386,7 +386,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 @@ -409,8 +409,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -432,8 +432,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -459,8 +459,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dword v1, v0, s[4:5] @@ -477,21 +477,21 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dword v1, v0, s[0:1] -; GFX940-NEXT: global_load_dword v2, v0, s[6:7] -; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: global_load_dword v1, v0, s[6:7] +; GFX940-NEXT: global_load_dword v2, v0, s[2:3] +; GFX940-NEXT: s_mov_b32 s2, 0xffff ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX940-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX940-NEXT: s_mov_b32 s0, 0x12341234 +; GFX940-NEXT: v_lshlrev_b32_e64 v1, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x12341234 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v1, v1, s0, v2 -; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-NEXT: v_bfi_b32 v1, v1, s2, v2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -508,27 +508,28 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0xc -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s5, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dword s8, s[8:9], 0xc +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v2, s5, v4, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v4, v2 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -546,8 +547,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -560,17 +561,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s0 +; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v0, s1, v3, v0 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -587,17 +588,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 @@ -606,8 +608,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -625,8 +627,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -638,16 +640,16 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v0, s0, v0, v3 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 +; GFX940-NEXT: v_perm_b32 v0, s6, v0, v3 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -664,27 +666,28 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0xc -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s5, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dword s8, s[8:9], 0xc +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v4, v3 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -702,8 +705,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_2: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -716,17 +719,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_2: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s0 +; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v1, s1, v3, v1 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -743,17 +746,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 @@ -762,8 +766,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -781,8 +785,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -794,16 +798,16 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v3 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 +; GFX940-NEXT: v_perm_b32 v1, s6, v1, v3 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -820,23 +824,24 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { ; SI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s6, s4, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s5, 4 -; SI-NEXT: s_or_b32 s6, s4, s6 -; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s5 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_and_b32 s5, s8, 0xffff +; SI-NEXT: s_lshl_b32 s6, s9, 4 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 ; SI-NEXT: v_bfi_b32 v2, s4, v5, v2 @@ -845,8 +850,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -871,8 +876,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -889,21 +894,21 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: s_lshl_b32 s1, s1, 4 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s0, s0 -; GFX940-NEXT: s_lshl_b64 s[0:1], 0xffff, s1 -; GFX940-NEXT: v_mov_b32_e32 v3, s2 -; GFX940-NEXT: v_mov_b32_e32 v4, s2 +; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX940-NEXT: s_lshl_b32 s2, s7, 4 +; GFX940-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX940-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, s4 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v1, s1, v3, v1 -; GFX940-NEXT: v_bfi_b32 v0, s0, v4, v0 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 +; GFX940-NEXT: v_bfi_b32 v1, s3, v3, v1 +; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -920,17 +925,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v8bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -939,8 +945,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v8bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -959,8 +965,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v8bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -972,16 +978,16 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v8bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v5 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 +; GFX940-NEXT: v_perm_b32 v1, s6, v1, v5 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -998,48 +1004,49 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v8bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: s_cmp_eq_u32 s5, 6 -; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s9, 6 +; SI-NEXT: v_mov_b32_e32 v6, s8 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_cmp_eq_u32 s9, 7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 4 +; SI-NEXT: s_cmp_eq_u32 s9, 4 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 5 +; SI-NEXT: s_cmp_eq_u32 s9, 5 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 2 +; SI-NEXT: s_cmp_eq_u32 s9, 2 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cmp_eq_u32 s9, 3 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 0 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 1 +; SI-NEXT: s_cmp_eq_u32 s9, 1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1055,8 +1062,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1109,8 +1116,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX900-LABEL: v_insertelement_v8bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -1154,48 +1161,48 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX940-LABEL: v_insertelement_v8bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX940-NEXT: s_mov_b32 s2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] -; GFX940-NEXT: s_cmp_eq_u32 s1, 6 -; GFX940-NEXT: v_mov_b32_e32 v5, s0 +; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX940-NEXT: s_cmp_eq_u32 s7, 6 +; GFX940-NEXT: v_mov_b32_e32 v5, s6 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 7 +; GFX940-NEXT: s_cmp_eq_u32 s7, 7 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 4 +; GFX940-NEXT: s_cmp_eq_u32 s7, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 5 +; GFX940-NEXT: s_cmp_eq_u32 s7, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 2 +; GFX940-NEXT: s_cmp_eq_u32 s7, 2 ; GFX940-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cmp_eq_u32 s7, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 0 +; GFX940-NEXT: s_cmp_eq_u32 s7, 0 ; GFX940-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cmp_eq_u32 s7, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc ; GFX940-NEXT: v_perm_b32 v1, v6, v1, s2 ; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1212,18 +1219,19 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v16bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v9, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[8:11], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[8:11], 0 addr64 offset:16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -1234,8 +1242,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_insertelement_v16bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1261,8 +1269,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX900-LABEL: v_insertelement_v16bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX900-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -1277,19 +1285,19 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX940-LABEL: v_insertelement_v16bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 +; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v9 +; GFX940-NEXT: v_perm_b32 v1, s6, v1, v9 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1306,8 +1314,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 @@ -1406,8 +1414,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -1503,8 +1511,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] @@ -1583,83 +1591,83 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX940-LABEL: v_insertelement_v16bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX940-NEXT: s_mov_b32 s2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 -; GFX940-NEXT: s_cmp_eq_u32 s1, 6 -; GFX940-NEXT: v_mov_b32_e32 v9, s0 +; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX940-NEXT: s_cmp_eq_u32 s7, 6 +; GFX940-NEXT: v_mov_b32_e32 v9, s6 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 7 +; GFX940-NEXT: s_cmp_eq_u32 s7, 7 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 4 +; GFX940-NEXT: s_cmp_eq_u32 s7, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 5 +; GFX940-NEXT: s_cmp_eq_u32 s7, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 2 +; GFX940-NEXT: s_cmp_eq_u32 s7, 2 ; GFX940-NEXT: v_perm_b32 v3, v3, v10, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cmp_eq_u32 s7, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 0 +; GFX940-NEXT: s_cmp_eq_u32 s7, 0 ; GFX940-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cmp_eq_u32 s7, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 14 +; GFX940-NEXT: s_cmp_eq_u32 s7, 14 ; GFX940-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 15 +; GFX940-NEXT: s_cmp_eq_u32 s7, 15 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 12 +; GFX940-NEXT: s_cmp_eq_u32 s7, 12 ; GFX940-NEXT: v_perm_b32 v0, v10, v0, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 13 +; GFX940-NEXT: s_cmp_eq_u32 s7, 13 ; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 10 +; GFX940-NEXT: s_cmp_eq_u32 s7, 10 ; GFX940-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 11 +; GFX940-NEXT: s_cmp_eq_u32 s7, 11 ; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 8 +; GFX940-NEXT: s_cmp_eq_u32 s7, 8 ; GFX940-NEXT: v_perm_b32 v6, v10, v6, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 9 +; GFX940-NEXT: s_cmp_eq_u32 s7, 9 ; GFX940-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc ; GFX940-NEXT: v_perm_b32 v5, v10, v5, s2 ; GFX940-NEXT: v_perm_b32 v4, v9, v4, s2 -; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index a5e0e5fdcb9a0d..d09af8fd2ac954 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2i16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -52,8 +52,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -65,8 +65,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; ; VI-LABEL: s_insertelement_v2i16_0_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -81,8 +81,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: s_insertelement_v2i16_0_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -98,15 +98,15 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_0_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_lh_b32_b16 s0, s0, s1 +; GFX11-NEXT: s_pack_lh_b32_b16 s2, s4, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -117,8 +117,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -134,8 +134,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -154,8 +154,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -175,18 +175,18 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s4, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use s1 +; GFX11-NEXT: ; use s2 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -201,8 +201,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -214,8 +214,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; VI-LABEL: s_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -229,8 +229,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -245,15 +245,15 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hh_b32_b16 s0, s0, s1 +; GFX11-NEXT: s_pack_hh_b32_b16 s2, s4, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 @@ -266,8 +266,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -283,8 +283,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -302,8 +302,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -322,18 +322,18 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s4, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_lh_b32_b16 s1, s0, s1 +; GFX11-NEXT: s_pack_lh_b32_b16 s2, s3, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use s0 +; GFX11-NEXT: ; use s3 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -349,8 +349,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -370,8 +370,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -392,8 +392,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -415,22 +415,22 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s4, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use s0 +; GFX11-NEXT: ; use s3 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use s1 +; GFX11-NEXT: ; use s2 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -450,7 +450,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -462,7 +462,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -476,7 +476,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -494,8 +494,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -507,8 +507,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; ; VI-LABEL: s_insertelement_v2i16_1_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -523,8 +523,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: s_insertelement_v2i16_1_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -540,15 +540,15 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_1_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 @@ -559,7 +559,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -572,7 +572,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -586,7 +586,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -605,7 +605,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -617,7 +617,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -631,7 +631,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -649,7 +649,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -662,7 +662,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -680,7 +680,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -698,7 +698,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -722,8 +722,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -735,8 +735,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -754,8 +754,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -773,17 +773,16 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060302 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_perm_b32 v1, v1, s4, 0x7060302 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -800,7 +799,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -812,7 +811,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -830,7 +829,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -848,7 +847,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -872,7 +871,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -885,7 +884,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -903,7 +902,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -921,7 +920,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -945,7 +944,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -957,7 +956,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -975,7 +974,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -993,7 +992,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1016,7 +1015,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1029,7 +1028,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1047,7 +1046,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1065,7 +1064,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1089,7 +1088,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1101,7 +1100,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1119,7 +1118,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1137,7 +1136,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1160,7 +1159,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1173,7 +1172,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1191,7 +1190,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1209,7 +1208,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1233,7 +1232,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1245,7 +1244,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1263,7 +1262,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1281,7 +1280,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1305,8 +1304,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1323,8 +1322,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1342,8 +1341,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1362,10 +1361,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_dynamic: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s3, s4, 4 @@ -1388,8 +1387,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 { ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1403,8 +1402,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1424,8 +1423,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1445,19 +1444,18 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_lshl_b32 s2, s4, 4 +; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, s2, 0x3e703e7, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1472,8 +1470,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -1490,8 +1488,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1517,8 +1515,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -1544,14 +1542,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 @@ -1576,8 +1574,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1590,8 +1588,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1609,8 +1607,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1629,17 +1627,16 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s0, v0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1656,8 +1653,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1669,8 +1666,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1688,8 +1685,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1708,17 +1705,16 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_perm_b32 v0, s4, v0, 0x5040100 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1735,8 +1731,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1749,8 +1745,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1768,8 +1764,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1788,17 +1784,16 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1815,8 +1810,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1828,8 +1823,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1847,8 +1842,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1867,17 +1862,16 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1894,8 +1888,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1908,8 +1902,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4i16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1927,8 +1921,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4i16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,17 +1941,16 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4i16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1975,8 +1968,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -1994,8 +1987,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2021,8 +2014,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2048,24 +2041,23 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s4 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 -; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 -; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, v3, s2, v1 +; GFX11-NEXT: v_bfi_b32 v0, v2, s2, v0 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2083,8 +2075,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -2101,8 +2093,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2127,8 +2119,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2153,21 +2145,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s0 -; GFX11-NEXT: s_lshl_b64 s[0:1], 0xffff, s1 +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_lshl_b32 s2, s5, 4 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX11-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, s1, s2, v1 -; GFX11-NEXT: v_bfi_b32 v0, s0, s2, v0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, s3, s4, v1 +; GFX11-NEXT: v_bfi_b32 v0, s2, s4, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2184,8 +2175,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2197,8 +2188,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v8f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2217,8 +2208,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2237,17 +2228,16 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2264,8 +2254,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -2278,8 +2268,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v8i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2298,8 +2288,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2318,17 +2308,16 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2345,8 +2334,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v8f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -2390,8 +2379,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2444,8 +2433,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v8f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2514,49 +2503,48 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v8f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] -; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_cmp_eq_u32 s5, 6 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_cmp_eq_u32 s5, 7 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: s_cmp_eq_u32 s5, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 5 +; GFX11-NEXT: s_cmp_eq_u32 s5, 5 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 2 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2 ; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 ; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 ; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2573,8 +2561,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2589,8 +2577,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2616,8 +2604,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: v_insertelement_v16f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 @@ -2643,22 +2631,21 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_insertelement_v16f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2675,8 +2662,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -2692,8 +2679,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2718,8 +2705,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: v_insertelement_v16i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2745,22 +2732,21 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_insertelement_v16i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -2777,8 +2763,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] @@ -2857,8 +2843,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2954,8 +2940,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v16f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3084,85 +3070,84 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v16f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 -; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 +; GFX11-NEXT: s_cmp_eq_u32 s5, 6 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_cmp_eq_u32 s5, 7 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: s_cmp_eq_u32 s5, 4 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 5 +; GFX11-NEXT: s_cmp_eq_u32 s5, 5 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 2 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 14 +; GFX11-NEXT: s_cmp_eq_u32 s5, 14 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3 ; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 15 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 15 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 12 +; GFX11-NEXT: s_cmp_eq_u32 s5, 12 ; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 ; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 13 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 13 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 10 +; GFX11-NEXT: s_cmp_eq_u32 s5, 10 ; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 11 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s5, 11 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 8 +; GFX11-NEXT: s_cmp_eq_u32 s5, 8 ; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 9 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 9 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s2 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 ; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2 ; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 ; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 ; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 ; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 ; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index b21b2adbcba951..fbeda72725b2a4 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -427,127 +427,127 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-NEXT: s_mul_i32 s1, s0, s7 -; GFX9-NEXT: s_sub_i32 s1, s6, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_sub_i32 s3, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX90A-NEXT: s_sub_i32 s0, 0, s7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s1, v0 -; GFX90A-NEXT: s_mul_i32 s0, s0, s1 -; GFX90A-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX90A-NEXT: s_add_i32 s1, s1, s0 -; GFX90A-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX90A-NEXT: s_mul_i32 s1, s0, s7 -; GFX90A-NEXT: s_sub_i32 s1, s6, s1 -; GFX90A-NEXT: s_add_i32 s2, s0, 1 -; GFX90A-NEXT: s_sub_i32 s3, s1, s7 -; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 -; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 -; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 -; GFX90A-NEXT: s_add_i32 s2, s0, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 -; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] +; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 +; GFX90A-NEXT: s_mul_i32 s4, s4, s5 +; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX90A-NEXT: s_add_i32 s5, s5, s4 +; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX90A-NEXT: s_mul_i32 s5, s4, s3 +; GFX90A-NEXT: s_sub_i32 s2, s2, s5 +; GFX90A-NEXT: s_add_i32 s6, s4, 1 +; GFX90A-NEXT: s_sub_i32 s5, s2, s3 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s4, s6, s4 +; GFX90A-NEXT: s_cselect_b32 s2, s5, s2 +; GFX90A-NEXT: s_add_i32 s5, s4, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s2, s5, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX10-NEXT: s_sub_i32 s5, 0, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s1, s1, s0 -; GFX10-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX10-NEXT: s_mul_i32 s1, s0, s7 -; GFX10-NEXT: s_add_i32 s2, s0, 1 -; GFX10-NEXT: s_sub_i32 s1, s6, s1 -; GFX10-NEXT: s_sub_i32 s3, s1, s7 -; GFX10-NEXT: s_cmp_ge_u32 s1, s7 -; GFX10-NEXT: s_cselect_b32 s0, s2, s0 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_add_i32 s2, s0, 1 -; GFX10-NEXT: s_cmp_ge_u32 s1, s7 -; GFX10-NEXT: s_cselect_b32 s0, s2, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_mul_i32 s5, s5, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX10-NEXT: s_add_i32 s4, s4, s5 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX10-NEXT: s_mul_i32 s5, s4, s3 +; GFX10-NEXT: s_sub_i32 s2, s2, s5 +; GFX10-NEXT: s_add_i32 s5, s4, 1 +; GFX10-NEXT: s_sub_i32 s6, s2, s3 +; GFX10-NEXT: s_cmp_ge_u32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s4, s5, s4 +; GFX10-NEXT: s_cselect_b32 s2, s6, s2 +; GFX10-NEXT: s_add_i32 s5, s4, 1 +; GFX10-NEXT: s_cmp_ge_u32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s2, s5, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-FLATSCR-NEXT: s_sub_i32 s0, 0, s7 +; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-FLATSCR-NEXT: s_add_i32 s1, s1, s0 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s6, s1 -; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s0, s7 -; GFX9-FLATSCR-NEXT: s_sub_i32 s1, s6, s1 -; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 -; GFX9-FLATSCR-NEXT: s_sub_i32 s3, s1, s7 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3 +; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1 +; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 @@ -584,7 +584,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 @@ -696,10 +696,10 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-NEXT: s_mul_i32 s0, s0, 5 @@ -716,10 +716,10 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB5_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX90A-NEXT: s_mul_i32 s0, s0, 5 @@ -738,7 +738,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s1, s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX10-NEXT: s_mul_i32 s0, s0, 5 @@ -756,10 +756,10 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, 5 @@ -779,7 +779,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB5_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -800,7 +800,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB5_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 @@ -892,25 +892,25 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) { define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) { ; GFX9-LABEL: atomic_add_ret_local: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -921,25 +921,25 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX90A-LABEL: atomic_add_ret_local: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: s_mul_i32 s4, s4, 5 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_mul_i32 s2, s2, 5 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB7_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -957,19 +957,19 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10-NEXT: s_mul_i32 s1, s1, 5 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -980,25 +980,25 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX9-FLATSCR-LABEL: atomic_add_ret_local: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_mov_b64 s[4:5], exec -; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-FLATSCR-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-FLATSCR-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB7_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1017,18 +1017,18 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1048,21 +1048,21 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mul_i32 s1, s1, 5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -1084,24 +1084,24 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1112,24 +1112,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX90A-LABEL: add_i32_constant: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB8_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: s_mul_i32 s4, s4, 5 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: s_mul_i32 s2, s2, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB8_2: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1147,17 +1147,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB8_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX10-NEXT: s_mul_i32 s1, s1, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc +; GFX10-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1168,24 +1168,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-FLATSCR-LABEL: add_i32_constant: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_mov_b64 s[4:5], exec -; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-FLATSCR-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB8_2: ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1204,17 +1204,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s1, s1, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1234,7 +1234,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB8_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 @@ -1242,12 +1242,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: s_mul_i32 s1, s1, 5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index f9073be7e260b8..8704f4e780448b 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; SI-LABEL: i8_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -20,10 +20,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; VI-LABEL: i8_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,8 +32,8 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; GFX9-LABEL: i8_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -80,11 +80,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; SI-LABEL: i8_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -92,10 +92,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; VI-LABEL: i8_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -104,8 +104,8 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; GFX9-LABEL: i8_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -155,11 +155,11 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; SI-LABEL: i8_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i8 s4, s4 +; SI-NEXT: s_sext_i32_i8 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -167,10 +167,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; VI-LABEL: i8_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i8 s2, s4 +; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -179,8 +179,8 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; GFX9-LABEL: i8_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -230,11 +230,11 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; SI-LABEL: i16_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -242,10 +242,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; VI-LABEL: i16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -254,8 +254,8 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; GFX9-LABEL: i16_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -302,11 +302,11 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; SI-LABEL: i16_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -314,10 +314,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; VI-LABEL: i16_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -326,8 +326,8 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; GFX9-LABEL: i16_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -377,11 +377,11 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { ; SI-LABEL: i16_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -389,10 +389,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; VI-LABEL: i16_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s2, s4 +; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -401,8 +401,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; GFX9-LABEL: i16_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -452,19 +452,19 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; SI-LABEL: i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -474,8 +474,8 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; GFX9-LABEL: i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -511,19 +511,19 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; SI-LABEL: f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -533,8 +533,8 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; GFX9-LABEL: f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -570,19 +570,19 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; SI-LABEL: v2i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v2i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -592,8 +592,8 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; GFX9-LABEL: v2i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -659,19 +659,19 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; SI-LABEL: v2i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v2i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -681,8 +681,8 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; GFX9-LABEL: v2i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -718,7 +718,7 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; SI-LABEL: v2i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 +731,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -781,7 +781,7 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; SI-LABEL: v2f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -805,7 +805,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -844,40 +844,40 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; SI-LABEL: v3i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v3i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -983,7 +983,7 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; SI-LABEL: v3i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -998,7 +998,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,36 +1102,36 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; SI-LABEL: v3i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v3i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1181,36 +1181,36 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; SI-LABEL: v3f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v3f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1260,19 +1260,19 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; SI-LABEL: v4i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1282,8 +1282,8 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; GFX9-LABEL: v4i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1319,7 +1319,7 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; SI-LABEL: v4i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1382,26 +1382,26 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; SI-LABEL: v4i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1410,8 +1410,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; GFX9-LABEL: v4i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1456,26 +1456,26 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; SI-LABEL: v4f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1484,8 +1484,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; GFX9-LABEL: v4f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1530,7 +1530,7 @@ entry: define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { ; SI-LABEL: v5i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1671,9 +1671,9 @@ entry: define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { ; SI-LABEL: v5i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s6, s[4:5], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1687,14 +1687,14 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> ; ; VI-LABEL: v5i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 8 -; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_short v[2:3], v4 @@ -1706,8 +1706,8 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s2 @@ -1902,9 +1902,9 @@ entry: define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind { ; SI-LABEL: v5i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 +; SI-NEXT: s_load_dword s8, s[4:5], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1920,19 +1920,19 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; ; VI-LABEL: v5i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s6, s4, 16 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_addc_u32 s7, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_add_u32 s4, s6, 16 +; VI-NEXT: s_addc_u32 s5, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1941,12 +1941,12 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; ; GFX9-LABEL: v5i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s8, s[6:7], 0x30 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -2000,9 +2000,9 @@ entry: define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind { ; SI-LABEL: v5f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 +; SI-NEXT: s_load_dword s8, s[4:5], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2018,31 +2018,31 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; ; VI-LABEL: v5f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s6, s4, 16 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s7, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_add_u32 s4, s6, 16 +; VI-NEXT: s_addc_u32 s5, s7, 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_dword v[1:2], v3 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2099,34 +2099,34 @@ entry: define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind { ; SI-LABEL: v5i64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2155,25 +2155,25 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; ; GFX9-LABEL: v5i64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[12:13] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i64_arg: @@ -2241,34 +2241,34 @@ entry: define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind { ; SI-LABEL: v5f64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2297,25 +2297,25 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; ; GFX9-LABEL: v5f64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[12:13] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5f64_arg: @@ -2384,7 +2384,7 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; SI-LABEL: v8i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,7 +2397,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; GFX9-LABEL: v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2635,26 +2635,26 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; SI-LABEL: v8i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v8i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -2663,8 +2663,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; GFX9-LABEL: v8i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2883,64 +2883,64 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; SI-LABEL: v8i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v8i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v8i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i32_arg: @@ -2994,64 +2994,64 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; SI-LABEL: v8f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v8f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v8f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8f32_arg: @@ -3106,26 +3106,26 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; SI-LABEL: v16i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v16i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -3134,8 +3134,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; GFX9-LABEL: v16i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -3556,64 +3556,64 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; SI-LABEL: v16i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v16i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v3, s15 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v16i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16i16_arg: @@ -4012,105 +4012,105 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; SI-LABEL: v16i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v16i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v1, s21 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v16i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s20 ; GFX9-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-NEXT: v_mov_b32_e32 v2, s22 ; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -4200,105 +4200,105 @@ entry: define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; SI-LABEL: v16f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v16f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v1, s21 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v16f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s20 ; GFX9-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-NEXT: v_mov_b32_e32 v2, s22 ; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -4388,7 +4388,7 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: kernel_arg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4401,7 +4401,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; VI-LABEL: kernel_arg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4412,7 +4412,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; GFX9-LABEL: kernel_arg_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +4450,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; SI-LABEL: f64_kernel_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4463,7 +4463,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: f64_kernel_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; GFX9-LABEL: f64_kernel_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4522,11 +4522,11 @@ entry: define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { ; SI-LABEL: i65_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s8, s4, 1 +; SI-NEXT: s_and_b32 s8, s6, 1 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -4539,10 +4539,10 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, 1 +; VI-NEXT: s_and_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -4558,8 +4558,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 1 @@ -4640,11 +4640,11 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4652,10 +4652,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; VI-LABEL: i1_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4664,8 +4664,8 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; GFX9-LABEL: i1_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4731,11 +4731,11 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4743,10 +4743,10 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4755,8 +4755,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4803,12 +4803,12 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s6, 1 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -4816,11 +4816,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -4829,8 +4829,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4879,11 +4879,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x10000 +; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4891,10 +4891,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s2, s4, 0x10000 +; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4903,8 +4903,8 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_sext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 @@ -4953,11 +4953,11 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -4966,8 +4966,8 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -4979,8 +4979,8 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_sext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 @@ -5062,10 +5062,10 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { ; SI-LABEL: struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb -; SI-NEXT: s_load_dword s9, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x11 +; SI-NEXT: s_load_dword s8, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xb +; SI-NEXT: s_load_dword s9, s[4:5], 0xf +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -5074,36 +5074,36 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x44 +; VI-NEXT: s_load_dword s6, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; VI-NEXT: s_load_dword s7, s[4:5], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x44 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -5114,10 +5114,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; ; GFX9-LABEL: struct_argument_alignment: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s5, s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX9-NEXT: s_load_dword s5, s[8:9], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5196,25 +5196,24 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; SI-LABEL: packed_struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dword s6, s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xa -; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:49 -; SI-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:50 -; SI-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:51 -; SI-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:52 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:53 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s2, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa +; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49 +; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50 +; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51 +; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 @@ -5222,45 +5221,45 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: packed_struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 49 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s2, 50 -; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: s_add_u32 s0, s4, 49 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: s_add_u32 s2, s4, 50 +; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 51 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_add_u32 s0, s4, 51 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v7, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s0, s2, 53 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s4, 53 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 +; VI-NEXT: s_load_dword s2, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v7 @@ -5281,10 +5280,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; GFX9-LABEL: packed_struct_argument_alignment: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v6, v2, s[6:7] offset:13 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] offset:17 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x4 +; GFX9-NEXT: global_load_dword v6, v2, s[8:9] offset:13 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[8:9] offset:17 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5380,11 +5379,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { ; SI-LABEL: struct_argument_alignment_after: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb -; SI-NEXT: s_load_dword s13, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x15 +; SI-NEXT: s_load_dword s12, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb +; SI-NEXT: s_load_dword s13, s[4:5], 0xf +; SI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -5414,26 +5413,26 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; VI-LABEL: struct_argument_alignment_after: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; VI-NEXT: s_load_dword s9, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x54 +; VI-NEXT: s_load_dword s10, s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c +; VI-NEXT: s_load_dword s11, s[4:5], 0x3c +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -5446,11 +5445,11 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; GFX9-LABEL: struct_argument_alignment_after: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5464,8 +5463,8 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -5546,7 +5545,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; SI-LABEL: array_3xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5566,7 +5565,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5584,7 +5583,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -5660,43 +5659,42 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; SI-LABEL: array_3xi16: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:40 -; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:38 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:42 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:40 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:38 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: buffer_store_byte v3, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v2, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: array_3xi16: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 38 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_add_u32 s0, s4, 38 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: s_add_u32 s2, s0, 2 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s2, 42 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s4, 42 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_load_ushort v4, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5713,10 +5711,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GFX9-LABEL: array_3xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 -; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] offset:2 -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] offset:4 +; GFX9-NEXT: global_load_ushort v3, v0, s[8:9] offset:2 +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -5831,19 +5829,18 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; SI-LABEL: small_array_round_down_offset: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: small_array_round_down_offset: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 37 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s4, 37 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -5855,7 +5852,7 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; GFX9-LABEL: small_array_round_down_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5889,23 +5886,23 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_align_constant_i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x49 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x49 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: byref_align_constant_i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x124 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x124 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -5919,8 +5916,8 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; GFX9-LABEL: byref_align_constant_i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x100 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -5973,53 +5970,63 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_natural_align_constant_v16i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s20, s[2:3], 0x29 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0x29 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: byref_natural_align_constant_v16i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v1, s21 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -6027,65 +6034,55 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s20 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x80 +; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 ; GFX9-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-NEXT: v_mov_b32_e32 v2, s22 ; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index f9f343268105ed..e30d9331b0e341 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -9,46 +9,49 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s33, 0 ; CHECK-NEXT: s_mov_b32 s32, 0x180000 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] +; CHECK-NEXT: ; implicit-def: $vgpr40 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: v_readlane_b32 s14, v40, 0 +; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_load_dword s8, s[6:7], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr40 : SGPR spill to VGPR lane +; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v40, s8, 0 +; CHECK-NEXT: v_writelane_b32 v40, s8, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def vgpr10 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_add_i32 s8, s33, 0x100100 ; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 s[16:17], 8 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: s_mov_b32 s6, s7 -; CHECK-NEXT: s_mov_b32 s9, s16 -; CHECK-NEXT: s_mov_b32 s7, s17 -; CHECK-NEXT: s_add_u32 s8, s8, s9 -; CHECK-NEXT: s_addc_u32 s6, s6, s7 +; CHECK-NEXT: s_mov_b64 s[18:19], 8 +; CHECK-NEXT: s_mov_b32 s8, s16 +; CHECK-NEXT: s_mov_b32 s9, s17 +; CHECK-NEXT: s_mov_b32 s16, s18 +; CHECK-NEXT: s_mov_b32 s15, s19 +; CHECK-NEXT: s_add_u32 s8, s8, s16 +; CHECK-NEXT: s_addc_u32 s15, s9, s15 ; CHECK-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; CHECK-NEXT: s_mov_b32 s9, s6 +; CHECK-NEXT: s_mov_b32 s9, s15 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x2000 -; CHECK-NEXT: ; implicit-def: $sgpr6 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, device_func@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, device_func@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; CHECK-NEXT: ; implicit-def: $sgpr15 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, device_func@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, device_func@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] -; CHECK-NEXT: s_mov_b32 s6, 20 -; CHECK-NEXT: v_lshlrev_b32_e64 v3, s6, v3 -; CHECK-NEXT: s_mov_b32 s6, 10 -; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2 +; CHECK-NEXT: s_mov_b32 s15, 20 +; CHECK-NEXT: v_lshlrev_b32_e64 v3, s15, v3 +; CHECK-NEXT: s_mov_b32 s15, 10 +; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2 ; CHECK-NEXT: v_or3_b32 v31, v1, v2, v3 -; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] @@ -56,7 +59,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s4, v40, 0 +; CHECK-NEXT: v_readlane_b32 s4, v40, 1 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_cmp_eq_u32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index d51ace630f6925..a744e82c543848 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -93,7 +93,7 @@ define void @use_extern_overalign() #0 { define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -113,27 +113,30 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 s12, s8, 8 +; CHECK-NEXT: s_addc_u32 s13, s9, 0 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 +; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_add_i32 s4, s4, 4 @@ -156,7 +159,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -176,27 +179,30 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 s12, s8, 8 +; CHECK-NEXT: s_addc_u32 s13, s9, 0 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 +; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -219,7 +225,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -239,27 +245,30 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 s12, s8, 8 +; CHECK-NEXT: s_addc_u32 s13, s9, 0 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 +; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -282,7 +291,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -302,27 +311,30 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 +; CHECK-NEXT: s_add_u32 s12, s8, 8 +; CHECK-NEXT: s_addc_u32 s13, s9, 0 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 +; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -352,29 +364,31 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 0 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -385,37 +399,41 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 4 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: s_mov_b32 s15, 4 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b16 v1, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -429,29 +447,31 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 2 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -462,37 +482,41 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 6 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: s_mov_b32 s15, 6 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b16 v1, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -506,29 +530,31 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 1 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -539,37 +565,41 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 5 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: s_mov_b32 s15, 5 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b16 v1, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -583,29 +613,31 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, 2 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 3 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: ds_write_b16 v4, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -616,37 +648,41 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s8, s8, 8 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_mov_b32 s14, s16 ; CHECK-NEXT: s_mov_b32 s15, 7 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_getpc_b64 s[14:15] +; CHECK-NEXT: s_add_u32 s14, s14, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s15, s15, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: s_mov_b32 s15, 7 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b16 v1, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll index 9899d20cf3ae60..a756a0b3dfecd5 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.0 (%ir-block.0): - ; GCN: liveins: $sgpr2_sgpr3 - ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index 99f4fbf3599483..f86c9365d0b798 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -9,7 +9,7 @@ define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -27,7 +27,7 @@ define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -44,7 +44,7 @@ define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -60,7 +60,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -80,7 +80,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -99,7 +99,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -117,7 +117,7 @@ define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 in ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -135,7 +135,7 @@ define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -152,7 +152,7 @@ define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -168,7 +168,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -188,7 +188,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -207,7 +207,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll index be270439ef57c4..ad547a317370bb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll index 50561de5bdbd20..82ac2bd0bc4478 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll index ce6336da4fd962..6cdfcb8c23fc60 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll index 66b4f143c60d07..ebd40c227e0a46 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[4:5], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 7524c7cbda6cd4..e7e3e5c6be119b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 @@ -32,26 +32,26 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s6, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s6, s7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 @@ -65,21 +65,21 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s6, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -87,33 +87,33 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -141,42 +141,42 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -184,48 +184,48 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -275,29 +275,29 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -336,7 +336,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -353,29 +353,29 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -398,42 +398,42 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -441,48 +441,48 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -500,42 +500,42 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -543,48 +543,48 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -602,42 +602,42 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -645,48 +645,48 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -705,42 +705,42 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -748,48 +748,48 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll index c62a8882c4245e..93bc7155cbfa46 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -110,7 +110,7 @@ define amdgpu_kernel void @id_i32() #0 { define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { ; GFX11-LABEL: id_arg_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 m0, s0 @@ -119,7 +119,7 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { ; ; GFX12-LABEL: id_arg_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 m0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index 18b03efe44efa9..c5becb1602473f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -15,7 +15,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -25,17 +25,17 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -46,12 +46,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) @@ -62,7 +62,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -72,17 +72,17 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -93,12 +93,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) @@ -118,7 +118,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -126,7 +126,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -140,10 +140,10 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -152,11 +152,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -164,11 +164,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -177,11 +177,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -194,10 +194,10 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -206,11 +206,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -218,11 +218,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -231,11 +231,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -248,10 +248,10 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -260,11 +260,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -272,11 +272,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -285,11 +285,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -302,10 +302,10 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -314,11 +314,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -326,11 +326,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -339,11 +339,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -356,10 +356,10 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -368,11 +368,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -380,11 +380,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -393,11 +393,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -410,10 +410,10 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -422,11 +422,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -434,11 +434,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -447,11 +447,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -464,10 +464,10 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -476,11 +476,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -488,11 +488,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -501,11 +501,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -518,10 +518,10 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -530,11 +530,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -542,11 +542,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -555,11 +555,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -572,10 +572,10 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -584,11 +584,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -596,11 +596,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -609,11 +609,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -626,10 +626,10 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -638,11 +638,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -650,11 +650,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -663,11 +663,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -680,10 +680,10 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -692,11 +692,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -704,11 +704,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -717,11 +717,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -734,10 +734,10 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -746,11 +746,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -758,11 +758,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -771,11 +771,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -788,10 +788,10 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -800,11 +800,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -812,11 +812,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -825,11 +825,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -842,10 +842,10 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -854,11 +854,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -866,11 +866,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GISEL-GFX11-LABEL: v_fcmp_f32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -879,11 +879,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -895,7 +895,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -905,17 +905,17 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -926,12 +926,12 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -941,7 +941,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_one: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -951,17 +951,17 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_one: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -972,12 +972,12 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -987,7 +987,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -997,17 +997,17 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1018,12 +1018,12 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1043,17 +1043,17 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1064,12 +1064,12 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_olt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1089,17 +1089,17 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1110,12 +1110,12 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -1125,7 +1125,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ole: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1135,17 +1135,17 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1156,12 +1156,12 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -1171,7 +1171,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1181,17 +1181,17 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1202,12 +1202,12 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -1217,7 +1217,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_o: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1227,17 +1227,17 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_o: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1248,12 +1248,12 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -1263,7 +1263,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uo: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1273,17 +1273,17 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1294,12 +1294,12 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -1309,7 +1309,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_une: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1319,17 +1319,17 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_une: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1340,12 +1340,12 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -1355,7 +1355,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1365,17 +1365,17 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1386,12 +1386,12 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1411,17 +1411,17 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1432,12 +1432,12 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -1447,7 +1447,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1457,17 +1457,17 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1478,12 +1478,12 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -1493,7 +1493,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1503,17 +1503,17 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1524,12 +1524,12 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13) store i32 %result, ptr addrspace(1) %out @@ -1541,12 +1541,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_endpgm @@ -1554,12 +1554,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; SDAG-GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1567,13 +1567,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_endpgm @@ -1581,12 +1581,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; GISEL-GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1601,12 +1601,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_endpgm @@ -1614,12 +1614,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; SDAG-GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1627,13 +1627,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_endpgm @@ -1641,12 +1641,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; GISEL-GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1668,7 +1668,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1676,7 +1676,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1691,10 +1691,10 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1703,11 +1703,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1715,11 +1715,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1728,11 +1728,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1746,10 +1746,10 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1758,11 +1758,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1770,11 +1770,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1783,11 +1783,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1801,10 +1801,10 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1813,11 +1813,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1825,11 +1825,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1838,11 +1838,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1856,10 +1856,10 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1868,11 +1868,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1880,11 +1880,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1893,11 +1893,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1911,10 +1911,10 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1923,11 +1923,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1935,11 +1935,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1948,11 +1948,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1966,10 +1966,10 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1978,11 +1978,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1990,11 +1990,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2003,11 +2003,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2021,10 +2021,10 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2033,11 +2033,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2045,11 +2045,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2058,11 +2058,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2076,10 +2076,10 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2088,11 +2088,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2100,11 +2100,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2113,11 +2113,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2131,10 +2131,10 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2143,11 +2143,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2155,11 +2155,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2168,11 +2168,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2186,10 +2186,10 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2198,11 +2198,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2210,11 +2210,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2223,11 +2223,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2241,10 +2241,10 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2253,11 +2253,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2265,11 +2265,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2278,11 +2278,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2295,10 +2295,10 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2307,11 +2307,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2319,11 +2319,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2332,11 +2332,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2349,10 +2349,10 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2361,11 +2361,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2373,11 +2373,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2386,11 +2386,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -2403,10 +2403,10 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2415,11 +2415,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GISEL-GFX11-LABEL: v_fcmp_f16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2440,11 +2440,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 2dddf37febf945..ec100a9e5b0f8d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -28,19 +28,19 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -72,7 +72,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |s3| @@ -84,19 +84,19 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -133,7 +133,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -145,7 +145,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -157,7 +157,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -172,11 +172,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -185,12 +185,12 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -198,11 +198,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -212,11 +212,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -232,11 +232,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -245,12 +245,12 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -258,11 +258,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -272,11 +272,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -292,11 +292,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -305,12 +305,12 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -318,11 +318,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -332,11 +332,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -352,11 +352,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -365,12 +365,12 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -378,11 +378,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -392,11 +392,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -412,11 +412,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -425,12 +425,12 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -438,11 +438,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -452,11 +452,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -472,11 +472,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -485,12 +485,12 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -498,11 +498,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -512,11 +512,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -532,11 +532,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -545,12 +545,12 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -558,11 +558,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -572,11 +572,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -592,11 +592,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -605,12 +605,12 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -618,11 +618,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -632,11 +632,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -652,11 +652,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -665,12 +665,12 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -678,11 +678,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -692,11 +692,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -712,11 +712,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -725,12 +725,12 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -738,11 +738,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -752,11 +752,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -772,11 +772,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -785,12 +785,12 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -798,11 +798,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -812,11 +812,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -832,11 +832,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -845,12 +845,12 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -858,11 +858,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -872,11 +872,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -892,11 +892,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -905,12 +905,12 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -918,11 +918,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -932,11 +932,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -952,11 +952,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -965,12 +965,12 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -978,11 +978,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; VI-SDAG-LABEL: v_fcmp_f32_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -992,11 +992,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oeq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1023,20 +1023,20 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1050,7 +1050,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1081,20 +1081,20 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1108,7 +1108,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1127,7 +1127,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ogt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1139,20 +1139,20 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1166,7 +1166,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1197,20 +1197,20 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1224,7 +1224,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1243,7 +1243,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_olt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1255,20 +1255,20 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1301,7 +1301,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ole: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1313,20 +1313,20 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1340,7 +1340,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ueq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1371,20 +1371,20 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1398,7 +1398,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1417,7 +1417,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_o: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1429,20 +1429,20 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1456,7 +1456,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1475,7 +1475,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1487,20 +1487,20 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1514,7 +1514,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_une: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1545,20 +1545,20 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1572,7 +1572,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1591,7 +1591,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1603,20 +1603,20 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1630,7 +1630,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1649,7 +1649,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1661,20 +1661,20 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1719,20 +1719,20 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1765,7 +1765,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1777,20 +1777,20 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1804,7 +1804,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1825,13 +1825,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1840,13 +1840,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1854,12 +1854,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1869,12 +1869,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1892,13 +1892,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1907,13 +1907,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1921,12 +1921,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1936,12 +1936,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1962,7 +1962,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1974,7 +1974,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -1986,7 +1986,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -2002,11 +2002,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2015,12 +2015,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2028,11 +2028,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2042,11 +2042,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2063,11 +2063,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2076,12 +2076,12 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2089,11 +2089,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2103,11 +2103,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2124,11 +2124,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2137,12 +2137,12 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2150,11 +2150,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2164,11 +2164,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2185,11 +2185,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2198,12 +2198,12 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2211,11 +2211,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2225,11 +2225,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2246,11 +2246,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2259,12 +2259,12 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2272,11 +2272,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2286,11 +2286,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2307,11 +2307,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2320,12 +2320,12 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2333,11 +2333,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2347,11 +2347,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2368,11 +2368,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2381,12 +2381,12 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2394,11 +2394,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2408,11 +2408,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2429,11 +2429,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2442,12 +2442,12 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2455,11 +2455,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2469,11 +2469,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2490,11 +2490,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2503,12 +2503,12 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2516,11 +2516,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2530,11 +2530,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2551,11 +2551,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2564,12 +2564,12 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2577,11 +2577,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2591,11 +2591,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2612,11 +2612,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2625,12 +2625,12 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2638,11 +2638,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2652,11 +2652,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2672,11 +2672,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2685,12 +2685,12 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2698,11 +2698,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2712,11 +2712,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2732,11 +2732,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2745,12 +2745,12 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2758,11 +2758,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2772,11 +2772,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2792,11 +2792,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2805,12 +2805,12 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2818,11 +2818,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; VI-SDAG-LABEL: v_fcmp_f16_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2832,11 +2832,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 455323d01eb3fe..dea0cc5fd07b66 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -8,7 +8,7 @@ declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bf define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -34,7 +34,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; GFX11-NEXT: scratch_load_u16 v1, off, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 9cf24539828b3c..066edea9698837 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c) define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -33,7 +33,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; SDAG-GFX11: ; %bb.0: ; %entry -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 ; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; ; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; GISEL-GFX11: ; %bb.0: ; %entry -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1 ; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 5a2c4197eef5de..b0ef568fbdce31 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, floa define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -35,7 +35,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll index a97f1dcc2bdd49..2afa9ba14ceae9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll @@ -7,7 +7,7 @@ declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64) define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -16,7 +16,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -31,28 +31,27 @@ entry: define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index e19a4bcf9e0a7b..8427b4e7f6f35a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -10,7 +10,7 @@ declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v8i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 @@ -44,7 +44,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v8f16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v8bf16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 5e1fe792393b9a..be4fa79951daff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -10,7 +10,7 @@ declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1) define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v4i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 @@ -44,7 +44,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v4f16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v4bf16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 37174dec520209..260b6fb39acb9a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -22,10 +22,10 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -34,11 +34,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -46,11 +46,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -59,11 +59,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -83,7 +83,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -105,10 +105,10 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -117,11 +117,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -129,11 +129,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -142,11 +142,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -159,10 +159,10 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -171,11 +171,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -183,11 +183,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -196,11 +196,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -213,10 +213,10 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -225,11 +225,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -237,11 +237,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -250,11 +250,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -267,10 +267,10 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -279,11 +279,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -291,11 +291,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -304,11 +304,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -321,10 +321,10 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -333,11 +333,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -345,11 +345,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -358,11 +358,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -375,10 +375,10 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i32_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -387,11 +387,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i32_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -399,11 +399,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-GFX11-LABEL: v_icmp_i32_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -412,11 +412,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i32_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -429,10 +429,10 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -441,11 +441,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -453,11 +453,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -466,11 +466,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -483,10 +483,10 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -495,11 +495,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -507,11 +507,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -520,11 +520,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -537,10 +537,10 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -549,11 +549,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -561,11 +561,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX11-LABEL: v_icmp_i32_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -574,11 +574,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -590,7 +590,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_eq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -600,17 +600,17 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_eq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -621,12 +621,12 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -636,7 +636,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_ne: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -646,17 +646,17 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_ne: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -667,12 +667,12 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -692,17 +692,17 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -713,12 +713,12 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -728,7 +728,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -738,17 +738,17 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -759,12 +759,12 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -774,7 +774,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -784,17 +784,17 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -805,12 +805,12 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -820,7 +820,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -830,17 +830,17 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_u64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -851,12 +851,12 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -866,7 +866,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sgt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -876,17 +876,17 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -897,12 +897,12 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -912,7 +912,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -922,17 +922,17 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_sge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -943,12 +943,12 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -958,7 +958,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_slt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -968,17 +968,17 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_slt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -989,12 +989,12 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sle: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1014,17 +1014,17 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7] -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i64_sle: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1035,12 +1035,12 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7] -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -1051,10 +1051,10 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1063,11 +1063,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1075,11 +1075,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1088,11 +1088,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1120,7 +1120,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1134,10 +1134,10 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1146,11 +1146,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1158,11 +1158,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1171,11 +1171,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1188,10 +1188,10 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1200,11 +1200,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1212,11 +1212,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1225,11 +1225,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1242,10 +1242,10 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1254,11 +1254,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1266,11 +1266,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1279,11 +1279,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1296,10 +1296,10 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1308,11 +1308,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1320,11 +1320,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1333,11 +1333,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1350,10 +1350,10 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1362,11 +1362,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1374,11 +1374,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1387,11 +1387,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1404,10 +1404,10 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i16_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1416,11 +1416,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i16_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1428,11 +1428,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-GFX11-LABEL: v_icmp_i16_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1441,11 +1441,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i16_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1458,10 +1458,10 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1470,11 +1470,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1482,11 +1482,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1495,11 +1495,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1512,10 +1512,10 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1524,11 +1524,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1536,11 +1536,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1549,11 +1549,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1566,10 +1566,10 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1578,11 +1578,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX10-NEXT: s_endpgm @@ -1590,11 +1590,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX11-LABEL: v_icmp_i16_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1603,11 +1603,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX10-NEXT: s_endpgm @@ -1619,7 +1619,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 @@ -1633,16 +1633,16 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_gt_u32 s6, 1 -; GFX10-NEXT: s_cselect_b32 s0, -1, 0 -; GFX10-NEXT: s_cmp_gt_u32 s7, 2 -; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: s_and_b32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_cmp_gt_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-NEXT: s_cmp_gt_u32 s3, 2 +; GFX10-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index f5f3bc43658e7f..13a53f0b96de2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -25,11 +25,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -38,11 +38,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -52,12 +52,12 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -65,11 +65,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -96,7 +96,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -104,7 +104,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i32: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -127,11 +127,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -140,11 +140,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -154,12 +154,12 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -167,11 +167,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -187,11 +187,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -200,11 +200,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -214,12 +214,12 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -227,11 +227,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -247,11 +247,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -260,11 +260,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -274,12 +274,12 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -287,11 +287,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -307,11 +307,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -320,11 +320,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -334,12 +334,12 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -347,11 +347,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -367,11 +367,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -380,11 +380,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -394,12 +394,12 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -407,11 +407,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -427,11 +427,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GFX11-LABEL: v_icmp_i32_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -440,11 +440,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i32_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -454,12 +454,12 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; GFX9-LABEL: v_icmp_i32_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -467,11 +467,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; GISEL-VI-LABEL: v_icmp_i32_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -487,11 +487,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -500,11 +500,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -514,12 +514,12 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -527,11 +527,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -547,11 +547,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -560,11 +560,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -574,12 +574,12 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -587,11 +587,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -607,11 +607,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -620,11 +620,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -634,12 +634,12 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -647,11 +647,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -666,7 +666,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_eq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3] @@ -678,7 +678,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -692,20 +692,20 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -724,7 +724,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_ne: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3] @@ -736,7 +736,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -750,20 +750,20 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -782,7 +782,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3] @@ -794,7 +794,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -808,20 +808,20 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -840,7 +840,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3] @@ -852,7 +852,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -866,20 +866,20 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -898,7 +898,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3] @@ -910,7 +910,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -924,20 +924,20 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3] @@ -968,7 +968,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -982,20 +982,20 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_u64_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sgt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1026,7 +1026,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1040,20 +1040,20 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3] @@ -1084,7 +1084,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1098,20 +1098,20 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1130,7 +1130,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_slt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1156,20 +1156,20 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,7 +1188,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3] @@ -1200,7 +1200,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1214,20 +1214,20 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i64_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1247,11 +1247,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1260,11 +1260,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1274,12 +1274,12 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1287,11 +1287,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1318,7 +1318,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i16: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -1349,11 +1349,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1362,11 +1362,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1376,12 +1376,12 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1389,11 +1389,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1409,11 +1409,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1422,11 +1422,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1436,12 +1436,12 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1449,11 +1449,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1469,11 +1469,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1482,11 +1482,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1496,12 +1496,12 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1509,11 +1509,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1529,11 +1529,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1542,11 +1542,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1556,12 +1556,12 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1569,11 +1569,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1589,11 +1589,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,11 +1602,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1616,12 +1616,12 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1629,11 +1629,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1649,11 +1649,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GFX11-LABEL: v_icmp_i16_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1662,11 +1662,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i16_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1676,12 +1676,12 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; GFX9-LABEL: v_icmp_i16_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1689,11 +1689,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; GISEL-VI-LABEL: v_icmp_i16_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1709,11 +1709,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1722,11 +1722,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1736,12 +1736,12 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1749,11 +1749,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1769,11 +1769,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1796,12 +1796,12 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1809,11 +1809,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1829,11 +1829,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1842,11 +1842,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1856,12 +1856,12 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; GFX9-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1869,11 +1869,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: v_icmp_i1_ne0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_u32 s2, 1 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -1920,17 +1920,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_gt_u32 s6, 1 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_cmp_gt_u32 s7, 2 +; GFX9-NEXT: s_cmp_gt_u32 s2, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_u32 s3, 2 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 3168e05b816bee..b532aa9cd7e86a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,7 +14,7 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 @@ -152,7 +152,7 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 0393a551dcd41b..1b41a10eec3fd0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,7 +163,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 @@ -171,10 +171,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s4, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s5, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s6, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s7, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -182,14 +182,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[12:15] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -215,7 +215,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 @@ -262,15 +262,15 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s4, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s5, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s6, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s7, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s8, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -278,14 +278,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 @@ -308,7 +308,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -353,8 +353,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -366,22 +366,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7] +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x1 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 @@ -392,8 +393,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] ; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 @@ -405,9 +406,10 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) @@ -416,9 +418,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 +; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 ; GFX11-NEXT: flat_load_b32 v11, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 @@ -449,8 +451,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -459,22 +461,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x1 +; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -482,8 +485,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] ; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 @@ -495,20 +498,21 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 ; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 ; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 +; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 80cd97c0c262c0..a585b49ef8d9a5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; SI-LABEL: is_private_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x32 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s4, s[8:9], 0x32 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -27,8 +27,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; CI-SDAG-LABEL: is_private_vgpr: ; CI-SDAG: ; %bb.0: -; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-SDAG-NEXT: s_load_dword s2, s[6:7], 0x32 +; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x32 ; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -43,7 +43,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -56,8 +56,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; CI-GISEL-LABEL: is_private_vgpr: ; CI-GISEL: ; %bb.0: -; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-GISEL-NEXT: s_load_dword s2, s[6:7], 0x32 +; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x32 ; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -73,7 +73,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -87,7 +87,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_private_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -113,8 +113,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; SI-LABEL: is_private_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[6:7], 0x1 -; SI-NEXT: s_load_dword s1, s[6:7], 0x32 +; SI-NEXT: s_load_dword s0, s[8:9], 0x1 +; SI-NEXT: s_load_dword s1, s[8:9], 0x32 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s0, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -131,8 +131,8 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; CI-SDAG-LABEL: is_private_sgpr: ; CI-SDAG: ; %bb.0: -; CI-SDAG-NEXT: s_load_dword s0, s[6:7], 0x1 -; CI-SDAG-NEXT: s_load_dword s1, s[6:7], 0x32 +; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 +; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-SDAG-LABEL: is_private_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 @@ -163,9 +163,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; CI-GISEL-LABEL: is_private_sgpr: ; CI-GISEL: ; %bb.0: -; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x32 +; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 @@ -178,7 +178,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-GISEL-LABEL: is_private_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 @@ -192,7 +192,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -206,7 +206,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 8383621cef2f6b..dc621f15709fd5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -43,8 +43,8 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; SI-LABEL: is_local_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x33 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s4, s[8:9], 0x33 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -60,8 +60,8 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; CI-SDAG-LABEL: is_local_vgpr: ; CI-SDAG: ; %bb.0: -; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-SDAG-NEXT: s_load_dword s2, s[6:7], 0x33 +; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x33 ; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -76,7 +76,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -89,8 +89,8 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; CI-GISEL-LABEL: is_local_vgpr: ; CI-GISEL: ; %bb.0: -; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-GISEL-NEXT: s_load_dword s2, s[6:7], 0x33 +; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x33 ; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -106,7 +106,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -120,7 +120,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_local_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -180,8 +180,8 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; SI-LABEL: is_local_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[6:7], 0x1 -; SI-NEXT: s_load_dword s1, s[6:7], 0x33 +; SI-NEXT: s_load_dword s0, s[8:9], 0x1 +; SI-NEXT: s_load_dword s1, s[8:9], 0x33 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s0, s1 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -198,8 +198,8 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; CI-SDAG-LABEL: is_local_sgpr: ; CI-SDAG: ; %bb.0: -; CI-SDAG-NEXT: s_load_dword s0, s[6:7], 0x1 -; CI-SDAG-NEXT: s_load_dword s1, s[6:7], 0x33 +; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 +; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -214,7 +214,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-SDAG-LABEL: is_local_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 @@ -230,9 +230,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; CI-GISEL-LABEL: is_local_sgpr: ; CI-GISEL: ; %bb.0: -; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x33 +; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 @@ -245,7 +245,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-GISEL-LABEL: is_local_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 @@ -259,7 +259,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -273,7 +273,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index c201f84cac7268..97219a8f143ce6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,8 +23,8 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s2, s10, 42 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s2, s12, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -42,28 +42,30 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-LABEL: indirect_lds_id: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s8, s6, 8 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, 8 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_addc_u32 s9, s7, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, function_lds_id@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, function_lds_id@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[20:21], s[14:15], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 21 +; GCN-NEXT: s_mov_b32 s14, s16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GCN-NEXT: s_endpgm call void @function_lds_id(ptr addrspace(1) %out) ret void @@ -72,7 +74,7 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index aa6069c67f62ee..71961a57bd080d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -15,37 +15,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -56,37 +56,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -97,85 +97,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -186,85 +186,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -275,11 +275,11 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -287,10 +287,10 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX11-LABEL: v_permlane16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-LABEL: v_permlane16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -314,11 +314,11 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -326,10 +326,10 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX11-LABEL: v_permlane16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -337,7 +337,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; ; GFX12-LABEL: v_permlane16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -352,31 +352,31 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -388,7 +388,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -412,7 +412,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -429,31 +429,31 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -465,7 +465,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -477,7 +477,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -489,7 +489,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -508,12 +508,12 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -521,19 +521,19 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX11-LABEL: v_permlane16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vll_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -550,33 +550,33 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -590,7 +590,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -604,7 +604,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -618,7 +618,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -638,12 +638,12 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX10-LABEL: v_permlane16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -651,19 +651,19 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX11-LABEL: v_permlane16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -680,33 +680,33 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -720,7 +720,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -734,7 +734,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -748,7 +748,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -765,59 +765,73 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl } define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlane16_b32_vvv_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -833,7 +847,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -855,35 +869,37 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -901,7 +917,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-LABEL: v_permlane16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -924,59 +940,73 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % } define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { -; GFX10-LABEL: v_permlane16_b32_vvv_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -992,7 +1022,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -1014,35 +1044,37 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -1060,7 +1092,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX12-LABEL: v_permlane16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1085,29 +1117,30 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1119,7 +1152,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -1131,7 +1164,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1143,7 +1176,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -1162,65 +1195,67 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s6 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s6 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s6 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vvs_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_permlane16_b32 v1, v1, s5, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vvs_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_permlane16_b32 v1, v1, s5, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -1231,29 +1266,30 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1265,7 +1301,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -1277,7 +1313,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1289,7 +1325,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -1308,65 +1344,67 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s6 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s6 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s6 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vvs_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_permlane16_b32 v1, v1, s5, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vvs_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_permlane16_b32 v1, v1, s5, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -1375,20 +1413,32 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl } define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-LABEL: v_permlane16_b32_vsv_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1401,7 +1451,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1414,7 +1464,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1427,7 +1477,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1447,99 +1497,101 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s4 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s4 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s4 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s4 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -1548,20 +1600,32 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % } define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { -; GFX10-LABEL: v_permlane16_b32_vsv_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1574,7 +1638,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1587,7 +1651,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1600,7 +1664,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1620,99 +1684,101 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s4 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s4 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s4 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s4 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -1724,37 +1790,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out @@ -1765,85 +1831,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1854,37 +1920,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out @@ -1895,85 +1961,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out @@ -1984,37 +2050,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out @@ -2025,85 +2091,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out @@ -2114,37 +2180,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out @@ -2155,85 +2221,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out @@ -2244,37 +2310,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out @@ -2285,85 +2351,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i64 %v, ptr addrspace(1) %out @@ -2374,37 +2440,37 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s3, s6 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out @@ -2415,85 +2481,85 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out @@ -2504,37 +2570,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -2545,37 +2611,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -2586,85 +2652,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -2675,85 +2741,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -2764,11 +2830,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2776,10 +2842,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX11-LABEL: v_permlanex16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2787,7 +2853,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; ; GFX12-LABEL: v_permlanex16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2803,11 +2869,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2815,10 +2881,10 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX11-LABEL: v_permlanex16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2826,7 +2892,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; ; GFX12-LABEL: v_permlanex16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2841,31 +2907,31 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -2877,7 +2943,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2889,7 +2955,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -2901,7 +2967,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2918,31 +2984,31 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -2954,7 +3020,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2966,7 +3032,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -2978,7 +3044,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2997,12 +3063,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -3010,19 +3076,19 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX11-LABEL: v_permlanex16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vll_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3040,12 +3106,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_movk_i32 s2, 0x1234 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -3053,19 +3119,19 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX11-LABEL: v_permlanex16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3082,33 +3148,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3122,7 +3188,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3136,7 +3202,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3150,7 +3216,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3169,33 +3235,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_movk_i32 s0, 0x1234 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, 0xc1d1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3209,7 +3275,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3223,7 +3289,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3237,7 +3303,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3254,59 +3320,73 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub } define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 %src0) { -; GFX10-LABEL: v_permlanex16_b32_vvv_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3322,7 +3402,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3342,59 +3422,73 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, float %src0) { -; GFX10-LABEL: v_permlanex16_b32_vvv_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3410,7 +3504,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3432,35 +3526,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3478,7 +3574,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: v_permlanex16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3503,35 +3599,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3549,7 +3647,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX12-LABEL: v_permlanex16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3574,29 +3672,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -3608,7 +3707,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -3620,7 +3719,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -3632,7 +3731,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -3650,29 +3749,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -3684,7 +3784,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -3696,7 +3796,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -3708,7 +3808,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 @@ -3727,65 +3827,67 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s6 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s6 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s6 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vvs_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s5, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vvs_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s5, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -3797,65 +3899,67 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s6 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s6 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s6 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vvs_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_permlanex16_b32 v1, v1, s5, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vvs_f64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: v_permlanex16_b32 v1, v1, s5, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -3864,20 +3968,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub } define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-LABEL: v_permlanex16_b32_vsv_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -3890,7 +4006,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -3903,7 +4019,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -3916,7 +4032,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -3933,20 +4049,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { -; GFX10-LABEL: v_permlanex16_b32_vsv_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 +; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -3959,7 +4087,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -3972,7 +4100,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -3985,7 +4113,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4005,99 +4133,101 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s4 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s4 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s4 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s4 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -4109,99 +4239,101 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s4 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s4 +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s4 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s4 +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s5, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) @@ -4213,37 +4345,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out @@ -4254,37 +4386,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 op_sel:[1,0] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out @@ -4295,85 +4427,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out @@ -4384,85 +4516,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out @@ -4473,37 +4605,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out @@ -4514,37 +4646,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 op_sel:[0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out @@ -4555,85 +4687,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out @@ -4644,85 +4776,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[0,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out @@ -4733,37 +4865,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out @@ -4774,37 +4906,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s3, s6 op_sel:[1,1] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out @@ -4815,85 +4947,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i64 %v, ptr addrspace(1) %out @@ -4904,85 +5036,85 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s6, s7 op_sel:[1,1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out @@ -4993,19 +5125,19 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5016,8 +5148,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5034,19 +5166,19 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5057,8 +5189,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5076,34 +5208,34 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -5116,8 +5248,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -5130,8 +5262,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -5144,8 +5276,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -5166,34 +5298,34 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -5207,8 +5339,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -5222,8 +5354,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -5237,8 +5369,8 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -5259,19 +5391,19 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5282,8 +5414,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5301,19 +5433,19 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5324,8 +5456,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5344,32 +5476,32 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -5382,8 +5514,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -5396,8 +5528,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -5410,8 +5542,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -5433,34 +5565,34 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -5474,8 +5606,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -5489,8 +5621,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -5504,8 +5636,8 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -5527,32 +5659,32 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -5564,8 +5696,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -5578,8 +5710,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -5591,8 +5723,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -5611,32 +5743,32 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -5648,8 +5780,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -5662,8 +5794,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -5675,8 +5807,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -5696,36 +5828,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -5738,8 +5870,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -5752,8 +5884,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -5766,8 +5898,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -5788,38 +5920,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -5834,8 +5966,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -5850,8 +5982,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -5866,8 +5998,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -5889,19 +6021,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5912,8 +6044,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5931,19 +6063,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5954,8 +6086,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5974,32 +6106,32 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6012,8 +6144,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6026,8 +6158,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6040,8 +6172,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6063,34 +6195,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6104,8 +6236,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6119,8 +6251,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6134,8 +6266,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6157,19 +6289,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6180,8 +6312,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6199,19 +6331,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6222,8 +6354,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6242,32 +6374,32 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6280,8 +6412,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6294,8 +6426,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6308,8 +6440,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6331,34 +6463,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6372,8 +6504,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6387,8 +6519,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6402,8 +6534,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6425,19 +6557,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6448,8 +6580,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6467,19 +6599,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6490,8 +6622,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6510,32 +6642,32 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6548,8 +6680,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6562,8 +6694,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6576,8 +6708,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6599,34 +6731,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6640,8 +6772,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6655,8 +6787,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6670,8 +6802,8 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6693,19 +6825,19 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6716,8 +6848,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6734,19 +6866,19 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6757,8 +6889,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6776,34 +6908,34 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6816,8 +6948,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6830,8 +6962,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6844,8 +6976,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6866,34 +6998,34 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -6907,8 +7039,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -6922,8 +7054,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -6937,8 +7069,8 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -6959,19 +7091,19 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6982,8 +7114,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7001,19 +7133,19 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7024,8 +7156,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7044,32 +7176,32 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7082,8 +7214,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7096,8 +7228,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -7110,8 +7242,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -7133,34 +7265,34 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7174,8 +7306,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7189,8 +7321,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -7204,8 +7336,8 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -7227,32 +7359,32 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7264,8 +7396,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7278,8 +7410,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -7291,8 +7423,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -7311,32 +7443,32 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7348,8 +7480,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7362,8 +7494,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -7375,8 +7507,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -7396,36 +7528,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7438,8 +7570,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7452,8 +7584,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -7466,8 +7598,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -7488,38 +7620,38 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 ; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -7534,8 +7666,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -7550,8 +7682,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -7566,8 +7698,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 @@ -7589,19 +7721,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7612,8 +7744,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7631,19 +7763,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7654,8 +7786,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7674,32 +7806,32 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7712,8 +7844,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7726,8 +7858,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -7740,8 +7872,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -7763,34 +7895,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7804,8 +7936,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7819,8 +7951,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -7834,8 +7966,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -7857,19 +7989,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7880,8 +8012,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7899,19 +8031,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7922,8 +8054,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -7942,32 +8074,32 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -7980,8 +8112,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -7994,8 +8126,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -8008,8 +8140,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -8031,34 +8163,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8072,8 +8204,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -8087,8 +8219,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -8102,8 +8234,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -8125,19 +8257,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8148,8 +8280,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8167,19 +8299,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8190,8 +8322,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8210,32 +8342,32 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8248,8 +8380,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -8262,8 +8394,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -8276,8 +8408,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -8299,34 +8431,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8340,8 +8472,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -8355,8 +8487,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -8370,8 +8502,8 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index 59be9f8641c1ba..33f0d60585e959 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -21,7 +21,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -37,7 +37,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -48,7 +48,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -64,7 +64,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -75,7 +75,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -102,7 +102,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -119,7 +119,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -130,7 +130,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -146,7 +146,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -157,7 +157,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -173,7 +173,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -184,7 +184,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -200,7 +200,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -211,7 +211,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -227,7 +227,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -238,7 +238,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -265,7 +265,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -281,7 +281,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -292,7 +292,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -309,7 +309,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -336,7 +336,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -347,7 +347,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -363,7 +363,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -374,7 +374,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -391,11 +391,11 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -404,10 +404,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -423,11 +423,11 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -436,10 +436,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -456,11 +456,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -469,12 +469,12 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -490,11 +490,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -503,10 +503,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -523,11 +523,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -536,10 +536,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -556,11 +556,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -569,10 +569,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -589,11 +589,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -602,10 +602,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -621,11 +621,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -634,10 +634,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -654,11 +654,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -667,12 +667,12 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -688,11 +688,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -701,10 +701,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -721,11 +721,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -734,10 +734,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -754,11 +754,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -767,10 +767,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index 216731519731a0..f23f9595446eb5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -9,10 +9,10 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { ; GFX11-LABEL: test_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { ; GFX11-LABEL: test_i: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-LABEL: test_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index 22c369e2da72e4..f7c37caf41eab1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { ; GFX11-SDAG-LABEL: test_p0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -22,14 +22,14 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-LABEL: test_v3p0: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x2 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s6 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v7, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1 ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4 @@ -38,8 +38,8 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16 -; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[2:3] +; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr> @llvm.amdgcn.permlane64.v3p0(<3 x ptr> %src0) store <3 x ptr> %v, ptr addrspace(1) %out @@ -50,10 +50,10 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0 ; GFX11-SDAG-LABEL: test_p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -67,18 +67,18 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-LABEL: test_v3p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0) store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out @@ -89,10 +89,10 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0 ; GFX11-SDAG-LABEL: test_p5: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -106,18 +106,18 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-LABEL: test_v3p5: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0) store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out @@ -128,10 +128,10 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0 ; GFX11-SDAG-LABEL: test_p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -145,18 +145,18 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-LABEL: test_v3p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0) store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index 419e19083f85e3..7bc3864ef5e12a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB0_1: ; %bb1 @@ -34,7 +34,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB1_1: ; %bb1 @@ -62,7 +62,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB2_1: ; %bb1 @@ -90,7 +90,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB3_1: ; %bb1 @@ -119,7 +119,7 @@ bb2: define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { ; CHECK-LABEL: raw_nonatomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc @@ -149,7 +149,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB5_1: ; %bb1 @@ -179,7 +179,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB6_1: ; %bb1 @@ -209,7 +209,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB7_1: ; %bb1 @@ -243,7 +243,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB8_1: ; %bb1 @@ -273,7 +273,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB9_1: ; %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll index 9f0b420a0a828d..5c0e34c5e2ec0b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -5,11 +5,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen offset:24 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -21,11 +17,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sg ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -36,11 +28,7 @@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffse ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -51,11 +39,7 @@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__ ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) @@ -66,11 +50,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen slc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 6541ac9553231c..8c0138aeb0ba01 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB0_1: ; %bb1 @@ -34,7 +34,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB1_1: ; %bb1 @@ -62,7 +62,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB2_1: ; %bb1 @@ -90,7 +90,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB3_1: ; %bb1 @@ -119,7 +119,7 @@ bb2: define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc @@ -149,7 +149,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB5_1: ; %bb1 @@ -179,7 +179,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB6_1: ; %bb1 @@ -209,7 +209,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB7_1: ; %bb1 @@ -243,7 +243,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB8_1: ; %bb1 @@ -273,7 +273,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB9_1: ; %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 8a0602e0472b53..22a473e44b273d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 +10,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 @@ -26,7 +26,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) @@ -41,7 +41,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -56,7 +56,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll index ce46e2755ae582..206cc9f2ec28d3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll @@ -8,29 +8,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen scc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc1 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -41,7 +33,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) ret void @@ -51,29 +43,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 +; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -84,7 +68,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 +; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void @@ -94,29 +78,21 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -127,7 +103,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -137,29 +113,21 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -170,7 +138,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) ret void @@ -180,29 +148,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -213,7 +173,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll index 327d80a7b67cdc..4a87ca8ad42fd6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll @@ -7,18 +7,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc scc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc0 sc1 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -29,7 +25,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) @@ -40,18 +36,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -62,7 +54,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -73,18 +65,14 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -95,7 +83,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -106,18 +94,14 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -128,7 +112,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) @@ -139,18 +123,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -161,7 +141,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll index 3ecbe3c71d0222..3540468566147a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll @@ -9,11 +9,7 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21,33 +17,21 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -65,11 +49,7 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -78,33 +58,21 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -122,11 +90,7 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 @@ -137,33 +101,21 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -187,11 +139,7 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 @@ -206,33 +154,21 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index cc1547eaad8309..d600d6e7591240 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -1180,22 +1180,14 @@ define double @buffer_load_f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %v ; PREGFX10-LABEL: buffer_load_f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1214,22 +1206,14 @@ define <2 x double> @buffer_load_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc ; PREGFX10-LABEL: buffer_load_v2f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1248,22 +1232,14 @@ define i64 @buffer_load_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voff ; PREGFX10-LABEL: buffer_load_i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1282,22 +1258,14 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, i ; PREGFX10-LABEL: buffer_load_v2i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1316,22 +1284,14 @@ define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffs ; PREGFX10-LABEL: buffer_load_p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1350,22 +1310,14 @@ define <2 x ptr> @buffer_load_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, i3 ; PREGFX10-LABEL: buffer_load_v2p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1384,22 +1336,14 @@ define ptr addrspace(1) @buffer_load_p1__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,22 +1362,14 @@ define <2 x ptr addrspace(1)> @buffer_load_v2p1__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1452,22 +1388,14 @@ define ptr addrspace(4) @buffer_load_p4__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1486,22 +1414,14 @@ define <2 x ptr addrspace(4)> @buffer_load_v2p4__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,22 +1440,14 @@ define ptr addrspace(999) @buffer_load_p999__voffset_add(ptr addrspace(8) inreg ; PREGFX10-LABEL: buffer_load_p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1554,22 +1466,14 @@ define <2 x ptr addrspace(999)> @buffer_load_v2p999__voffset_add(ptr addrspace(8 ; PREGFX10-LABEL: buffer_load_v2p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1588,22 +1492,14 @@ define ptr addrspace(2) @buffer_load_p2__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1622,22 +1518,14 @@ define <2 x ptr addrspace(2)> @buffer_load_v2p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1656,11 +1544,7 @@ define <3 x ptr addrspace(2)> @buffer_load_v3p2__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1679,22 +1563,14 @@ define <4 x ptr addrspace(2)> @buffer_load_v4p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1713,22 +1589,14 @@ define ptr addrspace(3) @buffer_load_p3__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1747,22 +1615,14 @@ define <2 x ptr addrspace(3)> @buffer_load_v2p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1781,11 +1641,7 @@ define <3 x ptr addrspace(3)> @buffer_load_v3p3__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1804,22 +1660,14 @@ define <4 x ptr addrspace(3)> @buffer_load_v4p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1838,22 +1686,14 @@ define ptr addrspace(5) @buffer_load_p5__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,22 +1712,14 @@ define <2 x ptr addrspace(5)> @buffer_load_v2p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1906,11 +1738,7 @@ define <3 x ptr addrspace(5)> @buffer_load_v3p5__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1929,22 +1757,14 @@ define <4 x ptr addrspace(5)> @buffer_load_v4p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1963,22 +1783,14 @@ define ptr addrspace(6) @buffer_load_p6__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1997,22 +1809,14 @@ define <2 x ptr addrspace(6)> @buffer_load_v2p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2031,11 +1835,7 @@ define <3 x ptr addrspace(6)> @buffer_load_v3p6__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2054,22 +1854,14 @@ define <4 x ptr addrspace(6)> @buffer_load_v4p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[16:19], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index 855ca390aabdce..de1f859132e61a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -599,22 +599,14 @@ define void @buffer_store_f64__voffset_add(ptr addrspace(8) inreg %rsrc, double ; VERDE-LABEL: buffer_store_f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -626,22 +618,14 @@ define void @buffer_store_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -653,22 +637,14 @@ define void @buffer_store_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i64 %da ; VERDE-LABEL: buffer_store_i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_i64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -680,22 +656,14 @@ define void @buffer_store_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2i64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -707,22 +675,14 @@ define void @buffer_store_p0__voffset_add(ptr addrspace(8) inreg %rsrc, ptr %dat ; VERDE-LABEL: buffer_store_p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -734,22 +694,14 @@ define void @buffer_store_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -761,22 +713,14 @@ define void @buffer_store_p1__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p1__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -788,22 +732,14 @@ define void @buffer_store_v2p1__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p1__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -815,22 +751,14 @@ define void @buffer_store_p4__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -842,22 +770,14 @@ define void @buffer_store_v2p4__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -869,22 +789,14 @@ define void @buffer_store_p999__voffset_add(ptr addrspace(8) inreg %rsrc, ptr ad ; VERDE-LABEL: buffer_store_p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -896,22 +808,14 @@ define void @buffer_store_v2p999__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -923,22 +827,14 @@ define void @buffer_store_p2__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -950,22 +846,14 @@ define void @buffer_store_v2p2__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -977,22 +865,14 @@ define void @buffer_store_v3p2__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1004,22 +884,14 @@ define void @buffer_store_v4p2__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1031,22 +903,14 @@ define void @buffer_store_p3__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1058,22 +922,14 @@ define void @buffer_store_v2p3__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1085,22 +941,14 @@ define void @buffer_store_v3p3__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1112,22 +960,14 @@ define void @buffer_store_v4p3__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1139,22 +979,14 @@ define void @buffer_store_p5__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1166,22 +998,14 @@ define void @buffer_store_v2p5__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1193,22 +1017,14 @@ define void @buffer_store_v3p5__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1220,22 +1036,14 @@ define void @buffer_store_v4p5__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1247,22 +1055,14 @@ define void @buffer_store_p6__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1274,22 +1074,14 @@ define void @buffer_store_v2p6__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1301,22 +1093,14 @@ define void @buffer_store_v3p6__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1328,22 +1112,14 @@ define void @buffer_store_v4p6__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[16:19], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll index af84994ca8c4c1..02fc82de5d7bc7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -8,39 +8,39 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -51,42 +51,42 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -97,50 +97,50 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -152,49 +152,49 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s7, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index eb349cbbe44d17..63b139bb25e775 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -10,49 +10,49 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] ; GFX12-PACKED-NEXT: s_endpgm main_body: @@ -63,52 +63,52 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] ; GFX12-PACKED-NEXT: s_endpgm main_body: @@ -119,75 +119,75 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff -; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX12-PACKED-SDAG-NEXT: s_and_b32 s4, s7, 0xffff +; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] ; GFX12-PACKED-SDAG-NEXT: s_endpgm ; ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 +; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s6, s6, s6 ; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] ; GFX12-PACKED-GISEL-NEXT: s_endpgm main_body: @@ -199,60 +199,60 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s7, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] ; GFX12-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 39a3b1c8adc9f1..9a2f0aa5adb772 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -31,7 +31,7 @@ define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i1_inreg: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: s_and_b32 s4, s6, 1 +; CHECK-SDAG-NEXT: s_and_b32 s4, s16, 1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -40,7 +40,7 @@ define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) { ; CHECK-GISEL-LABEL: test_readfirstlane_i1_inreg: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: s_and_b32 s4, s6, 1 +; CHECK-GISEL-NEXT: s_and_b32 s4, s16, 1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -258,7 +258,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -268,7 +268,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -283,7 +283,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -294,7 +294,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -311,7 +311,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -322,7 +322,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -340,7 +340,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -353,7 +353,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -372,7 +372,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -385,7 +385,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -418,7 +418,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -438,7 +438,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -452,7 +452,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -472,7 +472,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_fi: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s15 +; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s17 ; CHECK-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-SDAG-NEXT: s_mov_b32 s4, 0 ; CHECK-SDAG-NEXT: ;;#ASMSTART @@ -482,7 +482,7 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) { ; ; CHECK-GISEL-LABEL: test_readfirstlane_fi: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s15 +; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s17 ; CHECK-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s4, 0 ; CHECK-GISEL-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 24a332fa211c15..a8560ff1aa2b0c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0 define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x4 +; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x4 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -91,7 +91,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s0, s[6:7], 0x4 +; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x4 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -110,7 +110,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[8:9], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -158,7 +158,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[8:9], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -178,7 +178,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -231,7 +231,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -309,7 +309,7 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -363,7 +363,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -384,7 +384,7 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -418,7 +418,7 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -431,7 +431,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -464,7 +464,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -500,7 +500,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -522,7 +522,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -560,7 +560,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -573,7 +573,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -592,7 +592,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -606,7 +606,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -640,7 +640,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index ce536a36b76044..f72f1e52d135fe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -18,8 +18,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -29,10 +29,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -40,85 +40,85 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_endpgm ; ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_endpgm ; ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_endpgm entry: @@ -130,7 +130,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -150,7 +150,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -159,7 +159,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -168,7 +168,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +195,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -204,7 +204,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -212,7 +212,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -264,7 +264,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -284,7 +284,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -304,7 +304,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -323,7 +323,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -342,7 +342,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -361,7 +361,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -380,7 +380,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 @@ -399,7 +399,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -418,7 +418,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -440,7 +440,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 @@ -462,7 +462,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 @@ -483,7 +483,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 @@ -512,33 +512,33 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 ; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -553,26 +553,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 ; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -583,33 +583,33 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-LABEL: divergent_cfg: ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 ; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -623,26 +623,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9GISEL-NEXT: .LBB4_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -652,33 +652,33 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-LABEL: divergent_cfg: ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 ; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -692,26 +692,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 ; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -725,7 +725,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 @@ -734,20 +734,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_max_u32 s1, s1, s6 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -761,7 +761,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 @@ -769,57 +769,57 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s6 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 ; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -835,27 +835,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if -; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 ; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -871,7 +871,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 @@ -880,21 +880,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -910,7 +910,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 @@ -918,19 +918,19 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if -; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index ffb27f40209048..4551c60770bdf5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -19,8 +19,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -30,10 +30,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -41,85 +41,85 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_endpgm ; ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_endpgm ; ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_endpgm entry: @@ -131,7 +131,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -141,7 +141,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -160,7 +160,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -169,7 +169,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -178,7 +178,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -187,7 +187,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -196,7 +196,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -213,7 +213,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -265,7 +265,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -285,7 +285,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, -1 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -305,7 +305,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -324,7 +324,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, -1 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -343,7 +343,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -362,7 +362,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -381,7 +381,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1 @@ -400,7 +400,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -441,7 +441,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, -1 @@ -463,7 +463,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1 @@ -484,7 +484,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 @@ -513,33 +513,33 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 ; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -554,26 +554,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL-NEXT: s_mov_b32 s6, s2 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, -1 ; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -584,33 +584,33 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-LABEL: divergent_cfg: ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 ; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -624,26 +624,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL-NEXT: s_mov_b32 s6, s2 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, -1 ; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9GISEL-NEXT: .LBB4_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -653,33 +653,33 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-LABEL: divergent_cfg: ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 ; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -693,26 +693,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 ; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -726,7 +726,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 @@ -735,20 +735,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_min_u32 s1, s1, s6 -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -762,7 +762,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 @@ -770,57 +770,57 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 -; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_min_u32 s0, s0, s6 -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 ; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -836,27 +836,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if -; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 ; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8 -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -872,7 +872,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 @@ -881,21 +881,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6 -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -911,7 +911,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 @@ -919,19 +919,19 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if -; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6 -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 8d99ec2e1b709f..e4b9299869334c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry -; VARIANT0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT0-NEXT: s_load_dword s4, s[2:3], 0xb +; VARIANT0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; VARIANT0-NEXT: s_load_dword s4, s[4:5], 0xb ; VARIANT0-NEXT: s_mov_b32 s3, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s2, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -31,8 +31,8 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry -; VARIANT1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT1-NEXT: s_load_dword s4, s[2:3], 0xb +; VARIANT1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; VARIANT1-NEXT: s_load_dword s4, s[4:5], 0xb ; VARIANT1-NEXT: s_mov_b32 s3, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s2, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -52,12 +52,12 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry -; VARIANT2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT2-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VARIANT2-NEXT: s_load_dword s2, s[4:5], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] -; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 +; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s2 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT2-NEXT: v_mov_b32_e32 v3, s1 @@ -72,12 +72,12 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry -; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT3-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VARIANT3-NEXT: s_load_dword s2, s[4:5], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] -; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 +; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s2 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT3-NEXT: v_mov_b32_e32 v3, s1 @@ -91,7 +91,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT4-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; VARIANT4-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) ; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v2 @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT5-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; VARIANT5-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) ; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v2 @@ -136,7 +136,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT6-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 ; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 ; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll index bc7052132a87b0..cf86e2e1dedee4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll @@ -37,7 +37,7 @@ define void @test_s_sleep_var2() { define amdgpu_kernel void @test_s_sleep_var3(i32 %arg) { ; GCN-LABEL: test_s_sleep_var3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GCN-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_sleep_var s0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 527627a5a2f67d..46359f7e990599 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 @@ -73,7 +73,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 @@ -177,7 +177,7 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 @@ -259,7 +259,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index a29e2298210a3a..dcc3e0df0c7443 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -7,7 +7,7 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16( define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -149,7 +149,7 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0 ; GCN-NEXT: v_mov_b32_e32 v18, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved( ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index b8a4674833cee4..9377da89a6b05d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,19 +29,19 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -83,33 +83,33 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(7) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -151,14 +151,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -180,12 +180,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -194,10 +194,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 ; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 ; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 @@ -208,10 +208,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:80 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 @@ -228,8 +228,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v19, v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 @@ -245,14 +245,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -261,12 +261,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -275,10 +275,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 @@ -289,10 +289,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 @@ -309,8 +309,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 @@ -326,14 +326,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -385,12 +385,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -398,8 +398,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -407,30 +407,30 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:80 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -442,8 +442,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -455,8 +455,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -468,18 +468,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) @@ -487,8 +487,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) @@ -496,30 +496,30 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -531,8 +531,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -544,8 +544,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -620,7 +620,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -727,7 +727,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -870,7 +870,7 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 @@ -1198,22 +1198,22 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, <5 x float> %in1) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, s4, v3 +; GCN-NEXT: v_mul_f32_e32 v4, s0, v3 ; GCN-NEXT: v_rndne_f32_e32 v5, v4 ; GCN-NEXT: v_sub_f32_e32 v6, v4, v5 -; GCN-NEXT: v_fma_f32 v4, s4, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s4, v7 +; GCN-NEXT: v_fma_f32 v4, s0, v3, -v4 +; GCN-NEXT: v_fmac_f32_e32 v4, s0, v7 ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s6, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1225,17 +1225,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mov_b32_e32 v9, 1.0 ; GCN-NEXT: v_ldexp_f32 v4, v4, v5 ; GCN-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; GCN-NEXT: v_mul_f32_e32 v10, s5, v3 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 +; GCN-NEXT: v_mul_f32_e32 v10, s1, v3 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GCN-NEXT: v_rndne_f32_e32 v11, v10 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; GCN-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fma_f32 v10, s5, v3, -v10 +; GCN-NEXT: v_fma_f32 v10, s1, v3, -v10 ; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_fmac_f32_e32 v10, s5, v7 +; GCN-NEXT: v_fmac_f32_e32 v10, s1, v7 ; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304 ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] @@ -1250,17 +1250,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 ; GCN-NEXT: v_ldexp_f32 v4, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v5 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v6 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 ; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_mul_f32_e32 v10, s6, v3 +; GCN-NEXT: v_mul_f32_e32 v10, s2, v3 ; GCN-NEXT: v_rndne_f32_e32 v11, v10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63] -; GCN-NEXT: v_fma_f32 v4, s6, v3, -v10 +; GCN-NEXT: v_fma_f32 v4, s2, v3, -v10 ; GCN-NEXT: v_sub_f32_e32 v12, v10, v11 -; GCN-NEXT: v_fmac_f32_e32 v4, s6, v7 +; GCN-NEXT: v_fmac_f32_e32 v4, s2, v7 ; GCN-NEXT: v_add_f32_e32 v4, v12, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v10, v11 @@ -1282,27 +1282,27 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168 ; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152 ; GCN-NEXT: v_ldexp_f32 v1, v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6 -; GCN-NEXT: v_mul_f32_e32 v4, s7, v3 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; GCN-NEXT: v_mul_f32_e32 v4, s3, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x54 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x54 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 -; GCN-NEXT: v_fma_f32 v4, s7, v3, -v4 -; GCN-NEXT: v_fmac_f32_e32 v4, s7, v7 +; GCN-NEXT: v_fma_f32 v4, s3, v3, -v4 +; GCN-NEXT: v_fmac_f32_e32 v4, s3, v7 ; GCN-NEXT: v_add_f32_e32 v1, v1, v4 ; GCN-NEXT: v_exp_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v10 -; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; GCN-NEXT: ds_read_b128 a[156:159], v2 offset:57456 ; GCN-NEXT: ds_read_b128 a[152:155], v2 offset:57440 ; GCN-NEXT: v_ldexp_f32 v1, v1, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_mul_f32_e32 v4, s8, v3 ; GCN-NEXT: v_fma_f32 v3, s8, v3, -v4 @@ -1324,7 +1324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s7, v0 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1383,22 +1383,22 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s4, v3 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s0, v3 ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v5, v4 ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v6, v4, v5 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s4, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s4, v7 +; EXACTCUTOFF-NEXT: v_fma_f32 v4, s0, v3, -v4 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s0, v7 ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 ; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s6, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1410,17 +1410,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v9, 1.0 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v5 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s5, v3 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s1, v3 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fma_f32 v10, s5, v3, -v10 +; EXACTCUTOFF-NEXT: v_fma_f32 v10, s1, v3, -v10 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s5, v7 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v10, s1, v7 ; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:8304 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v9, v4, a[0:31] @@ -1435,17 +1435,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:8208 ; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:8192 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v4, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v5 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v6 +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s6, v3 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v10, s2, v3 ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v11, v10 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v9, v4, a[32:63] -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s6, v3, -v10 +; EXACTCUTOFF-NEXT: v_fma_f32 v4, s2, v3, -v10 ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v12, v10, v11 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s6, v7 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s2, v7 ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v12, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v10, v11 @@ -1467,27 +1467,27 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168 ; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6 -; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s7, v3 +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s3, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[2:3], 0x54 +; EXACTCUTOFF-NEXT: s_load_dword s8, s[4:5], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 -; EXACTCUTOFF-NEXT: v_fma_f32 v4, s7, v3, -v4 -; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s7, v7 +; EXACTCUTOFF-NEXT: v_fma_f32 v4, s3, v3, -v4 +; EXACTCUTOFF-NEXT: v_fmac_f32_e32 v4, s3, v7 ; EXACTCUTOFF-NEXT: v_add_f32_e32 v1, v1, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v1, v1 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v4, v10 -; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; EXACTCUTOFF-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v2 offset:57456 ; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v2 offset:57440 ; EXACTCUTOFF-NEXT: v_ldexp_f32 v1, v1, v4 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6 +; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s8, v3 ; EXACTCUTOFF-NEXT: v_fma_f32 v3, s8, v3, -v4 @@ -1509,7 +1509,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s7, v0 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1520,8 +1520,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr2_sgpr3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s7 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll index a9823d89048bdc..9a001e0b803941 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_doorbell: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -14,7 +14,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_doorbell: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_ddid: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_ddid: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -51,7 +51,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tma: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -66,7 +66,7 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_realtime: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -81,7 +81,7 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_savewave: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -90,7 +90,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_savewave: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -104,7 +104,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tba: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -119,7 +119,7 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_0_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -128,7 +128,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_0_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -142,7 +142,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_99999_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 6fb5a9ce47a843..6cb2d6d55ea320 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,12 +5,12 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -26,7 +26,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -43,7 +43,7 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -70,7 +70,7 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -89,17 +89,17 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: s_buffer_load_dword s7, s[0:3], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3] ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cmp_lg_u32 s7, 56 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 @@ -141,12 +141,12 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] @@ -163,7 +163,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -192,12 +192,12 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] @@ -214,12 +214,12 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] @@ -236,7 +236,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -263,7 +263,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -290,12 +290,12 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] @@ -312,7 +312,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -340,7 +340,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -368,7 +368,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -396,7 +396,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -423,12 +423,12 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -444,12 +444,12 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -465,12 +465,12 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -486,12 +486,12 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index 2a979976d806c9..259e3162e8bd1e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -6,11 +6,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -36,7 +37,7 @@ bb2: define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %addr) { ; CHECK-LABEL: struct_atomic_buffer_load_i32_const_idx: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB1_1: ; %bb1 @@ -66,11 +67,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -97,11 +99,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; CHECK-LABEL: struct_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc @@ -127,11 +130,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc @@ -158,10 +162,10 @@ define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_nonatomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0 ; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -190,12 +194,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc @@ -223,11 +227,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -255,11 +260,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -291,11 +297,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc @@ -323,11 +330,12 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll index c1f1782ea5a87f..a283f27fe065cb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -5,11 +5,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen offset:24 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[16:19], s20 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -22,11 +18,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset_ ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -37,11 +29,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[16:19], s20 idxen offen slc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -52,11 +40,7 @@ define void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vof ; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen offset:24 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[16:19], s20 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 70296a0a7bec6a..d4955d1b01f667 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -6,11 +6,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -36,7 +37,7 @@ bb2: define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrspace(8) %ptr) { ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB1_1: ; %bb1 @@ -66,11 +67,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -97,11 +99,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc @@ -127,11 +130,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc @@ -158,10 +162,10 @@ define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_nonatomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0 ; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -190,12 +194,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc @@ -223,11 +227,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc @@ -255,11 +260,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc @@ -291,11 +297,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc @@ -323,11 +330,12 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s4, s[2:3], 0x34 -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 2efade9fcbba17..2698ce1dc3fe3e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 +10,7 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -25,7 +25,7 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgp ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s16 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index d5b5c71cc42a95..6e94d4fe9fa271 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -8,11 +8,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[16:19], s20 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -21,11 +17,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -34,7 +26,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -45,7 +37,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -56,29 +48,21 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 idxen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 idxen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -89,7 +73,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -99,11 +83,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[16:19], s20 idxen offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -112,11 +92,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -125,7 +101,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -136,7 +112,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -146,11 +122,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX908-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[16:19], s20 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -159,11 +131,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[16:19], s20 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -172,7 +140,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -183,7 +151,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index a312a3cb0a95cf..9d8572493b4563 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -9,11 +9,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -22,7 +18,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -33,7 +29,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -45,18 +41,14 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 idxen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -67,7 +59,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -80,11 +72,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[16:19], s20 idxen offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -93,7 +81,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -104,7 +92,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -117,11 +105,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[16:19], s20 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -130,7 +114,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -141,7 +125,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index 8d1dce76d2cc8b..6da16f0a3b053c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -10,40 +10,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -54,7 +42,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -65,40 +53,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -109,7 +85,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -121,40 +97,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[16:19], s20 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], s20 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], s20 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s16 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +129,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -176,40 +140,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -220,7 +172,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -231,39 +183,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -273,7 +213,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -283,39 +223,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: @@ -325,7 +253,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -337,39 +265,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[16:19], s20 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], s20 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], s20 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s16 idxen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -379,7 +295,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -389,39 +305,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[16:19], s20 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen slc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen slc ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -431,7 +335,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -442,7 +346,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: v_readfirstlane_b32 s9, v2 @@ -453,20 +357,20 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s16 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v1 ; GFX7-NEXT: v_readfirstlane_b32 s9, v2 @@ -477,13 +381,13 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s16 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -501,7 +405,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s16 idxen offen offset:256 glc ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll index 4f9bac584a78e4..8254a86a3467a5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll @@ -9,22 +9,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -35,22 +27,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -62,22 +46,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], s20 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], s20 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -88,22 +64,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -114,22 +82,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -140,22 +100,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -168,22 +120,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], s20 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], s20 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -194,22 +138,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -221,7 +157,7 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v2 ; GFX6-NEXT: v_readfirstlane_b32 s9, v3 @@ -232,20 +168,20 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s16 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v2 ; GFX7-NEXT: v_readfirstlane_b32 s9, v3 @@ -256,13 +192,13 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s16 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index 06b1a9cc70513e..aa41ef024d6e04 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -10,40 +10,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -54,7 +42,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -65,40 +53,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -109,7 +85,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -121,40 +97,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], s20 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], s20 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], s20 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s16 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +129,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -176,40 +140,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -220,7 +172,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -231,39 +183,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -273,7 +213,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -283,39 +223,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: @@ -325,7 +253,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -337,39 +265,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], s20 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], s20 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], s20 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s16 idxen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -379,7 +295,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -389,39 +305,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[16:19], s20 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen slc ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -431,7 +335,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -442,7 +346,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: v_readfirstlane_b32 s9, v2 @@ -453,20 +357,20 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s16 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v1 ; GFX7-NEXT: v_readfirstlane_b32 s9, v2 @@ -477,13 +381,13 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s16 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -501,7 +405,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s16 idxen offen offset:256 glc ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll index 01bc833d59be79..3934a1f9c02aad 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll @@ -9,22 +9,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -35,22 +27,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -62,22 +46,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], s20 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], s20 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -88,22 +64,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -114,22 +82,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -140,22 +100,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -168,22 +120,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], s20 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], s20 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -194,22 +138,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[16:19], s20 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -221,7 +157,7 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v2 ; GFX6-NEXT: v_readfirstlane_b32 s9, v3 @@ -232,20 +168,20 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s16 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[12:13] +; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v2 ; GFX7-NEXT: v_readfirstlane_b32 s9, v3 @@ -256,13 +192,13 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s16 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[12:13] +; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll index 58b422dd6a7510..8a281376965d1b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { ; GCN-LABEL: buffer_store_format_d16_x: ; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 -; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x30 +; GCN-NEXT: s_load_dword s5, s[8:9], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -22,8 +22,8 @@ main_body: define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %index) { ; UNPACKED-LABEL: buffer_store_format_d16_xy: ; UNPACKED: ; %bb.0: ; %main_body -; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 ; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -35,8 +35,8 @@ define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 ; ; PACKED-LABEL: buffer_store_format_d16_xy: ; PACKED: ; %bb.0: ; %main_body -; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -50,9 +50,9 @@ main_body: define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { ; UNPACKED-LABEL: buffer_store_format_d16_xyz: ; UNPACKED: ; %bb.0: ; %main_body -; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18 ; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 @@ -66,14 +66,14 @@ define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, < ; ; PACKED-LABEL: buffer_store_format_d16_xyz: ; PACKED: ; %bb.0: ; %main_body -; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PACKED-NEXT: v_mov_b32_e32 v1, s5 -; PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PACKED-NEXT: v_mov_b32_e32 v2, s6 ; PACKED-NEXT: buffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 idxen ; PACKED-NEXT: s_endpgm main_body: @@ -85,9 +85,9 @@ main_body: define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { ; UNPACKED-LABEL: buffer_store_format_d16_xyzw: ; UNPACKED: ; %bb.0: ; %main_body -; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18 ; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 ; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -103,13 +103,13 @@ define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, ; ; PACKED-LABEL: buffer_store_format_d16_xyzw: ; PACKED: ; %bb.0: ; %main_body -; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PACKED-NEXT: v_mov_b32_e32 v1, s5 -; PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PACKED-NEXT: v_mov_b32_e32 v2, s6 ; PACKED-NEXT: buffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 idxen ; PACKED-NEXT: s_endpgm main_body: @@ -120,9 +120,9 @@ main_body: define amdgpu_kernel void @buffer_store_format_i16_x(ptr addrspace(8) %rsrc, [8 x i32], i16 %data, [8 x i32], i32 %index) { ; GCN-LABEL: buffer_store_format_i16_x: ; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 -; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x30 +; GCN-NEXT: s_load_dword s5, s[8:9], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll index d625dc17286a14..e5eae03bb5bde5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -18,8 +18,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -29,8 +29,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -40,11 +40,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -55,8 +55,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -68,8 +68,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -79,8 +79,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -90,11 +90,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -105,9 +105,9 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 @@ -121,42 +121,42 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s8, s[4:5], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -168,9 +168,9 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -186,39 +186,39 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s8, s[4:5], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index e4199e199feb14..17ebb1a835462d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -21,8 +21,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -32,8 +32,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -43,22 +43,22 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX12-PACKED-NEXT: s_endpgm main_body: @@ -69,8 +69,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -82,8 +82,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -93,8 +93,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -104,22 +104,22 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX12-PACKED-NEXT: s_endpgm main_body: @@ -130,9 +130,9 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 @@ -146,68 +146,68 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s8, s[4:5], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[8:10], s[4:5], 0x10 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff -; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5 -; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-PACKED-SDAG-NEXT: s_and_b32 s4, s9, 0xffff +; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s8 +; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX12-PACKED-SDAG-NEXT: s_endpgm ; ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[8:10], s[4:5], 0x10 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s8, s8, s8 +; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX12-PACKED-GISEL-NEXT: s_endpgm main_body: @@ -219,9 +219,9 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[8:9], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -237,51 +237,51 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s8, s[4:5], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX11-PACKED-NEXT: s_endpgm ; ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b96 s[8:10], s[4:5], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 -; GFX12-PACKED-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s8 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s9 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v2, s10 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen ; GFX12-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 279a64adfbda15..defaf70535cc5a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -50,7 +50,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -69,7 +69,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -83,7 +83,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s8, 0x7b @@ -117,7 +117,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_movk_i32 s8, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -137,7 +137,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_reg_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_reg_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_imm_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_imm_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +187,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zextload_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -204,7 +204,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: bfe_u32_zextload_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -229,7 +229,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: bfe_u32_zext_in_reg_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -275,7 +275,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -294,7 +294,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: bfe_u32_zext_in_reg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -341,7 +341,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -389,7 +389,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -465,7 +465,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +484,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou ; ; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,7 +511,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -529,7 +529,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -553,7 +553,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -563,7 +563,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -580,7 +580,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -590,7 +590,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -607,7 +607,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,7 +617,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -635,7 +635,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -653,7 +653,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -679,7 +679,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -698,7 +698,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -724,7 +724,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -742,7 +742,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -767,7 +767,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -785,7 +785,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +810,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -828,7 +828,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -870,7 +870,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -894,7 +894,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -912,7 +912,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -936,7 +936,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -954,7 +954,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -979,7 +979,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -997,7 +997,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1207,7 +1207,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1232,7 +1232,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1272,7 +1272,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,7 +1357,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1382,7 +1382,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 40 @@ -1407,7 +1407,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 40 @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_15: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_15: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1447,7 +1447,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_17: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_17: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1526,10 +1526,10 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1547,24 +1547,26 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 ; ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { @@ -1579,11 +1581,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1591,12 +1593,12 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; VI-NEXT: s_bfe_u32 s4, s6, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1609,7 +1611,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s2, s3 @@ -1623,7 +1625,7 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,11 +1645,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1655,12 +1657,12 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; VI-NEXT: s_bfe_u32 s4, s6, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1673,11 +1675,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1685,12 +1687,12 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; VI-NEXT: s_bfe_u32 s4, s6, 0x30006 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1703,11 +1705,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x150002 +; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1715,12 +1717,12 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: shl_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_u32 s4, s4, 0x150002 +; VI-NEXT: s_bfe_u32 s4, s6, 0x150002 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 837d484583d53f..cbc76a32a75e44 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -14,7 +14,7 @@ declare double @llvm.amdgcn.writelane.f64(double, i32, double) #0 define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -66,7 +66,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -96,8 +96,8 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -114,39 +114,39 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -163,33 +163,33 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm %oldval = load i64, ptr addrspace(1) %out %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval) @@ -200,8 +200,8 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -218,39 +218,39 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -267,33 +267,33 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s2 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm %oldval = load double, ptr addrspace(1) %out %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval) @@ -304,8 +304,8 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -319,8 +319,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -333,8 +333,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -346,8 +346,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -361,8 +361,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -375,8 +375,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -394,8 +394,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -411,8 +411,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -427,8 +427,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -442,8 +442,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -459,8 +459,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -475,8 +475,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -496,8 +496,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -515,8 +515,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -532,8 +532,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -549,8 +549,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -568,8 +568,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -585,8 +585,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -607,7 +607,7 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -647,7 +647,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -666,7 +666,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -690,7 +690,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -707,7 +707,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -736,7 +736,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -761,7 +761,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -780,7 +780,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -801,7 +801,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -826,7 +826,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -844,7 +844,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -875,7 +875,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -902,7 +902,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -922,7 +922,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -944,7 +944,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -971,7 +971,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -990,7 +990,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -1023,8 +1023,8 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND @@ -1043,8 +1043,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: ;;#ASMSTART ; GFX1010-SDAG-NEXT: s_mov_b32 m0, -1 @@ -1060,8 +1060,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: ;;#ASMSTART ; GFX1100-SDAG-NEXT: s_mov_b32 m0, -1 @@ -1076,8 +1076,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; ; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND @@ -1096,8 +1096,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX1010-GISEL-NEXT: ;;#ASMSTART ; GFX1010-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1010-GISEL-NEXT: ;;#ASMEND @@ -1113,8 +1113,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1100-GISEL-NEXT: ;;#ASMSTART ; GFX1100-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1100-GISEL-NEXT: ;;#ASMEND @@ -1136,8 +1136,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1151,8 +1151,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-SDAG-LABEL: test_writelane_imm_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1165,8 +1165,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-SDAG-LABEL: test_writelane_imm_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -1178,8 +1178,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; ; GFX802-GISEL-LABEL: test_writelane_imm_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1193,8 +1193,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-GISEL-LABEL: test_writelane_imm_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1207,8 +1207,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-GISEL-LABEL: test_writelane_imm_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1241,7 +1241,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-SDAG-LABEL: test_writelane_imm_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1255,7 +1255,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1100-SDAG-LABEL: test_writelane_imm_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1269,7 +1269,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1284,7 +1284,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-GISEL-LABEL: test_writelane_imm_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1100-GISEL-LABEL: test_writelane_imm_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1318,7 +1318,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-SDAG-LABEL: test_writelane_imm_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-SDAG-LABEL: test_writelane_imm_f64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1376,7 +1376,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-GISEL-LABEL: test_writelane_imm_f64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1390,7 +1390,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-GISEL-LABEL: test_writelane_imm_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1410,8 +1410,8 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1424,8 +1424,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -1436,19 +1436,19 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1461,8 +1461,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1473,11 +1473,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm @@ -1489,12 +1489,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1506,42 +1506,42 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s4 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1552,31 +1552,31 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-GISEL-NEXT: s_endpgm %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval) store i64 %writelane, ptr addrspace(1) %out, align 4 @@ -1586,12 +1586,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1603,42 +1603,42 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s4 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1649,31 +1649,31 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-GISEL-NEXT: s_endpgm %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval) store double %writelane, ptr addrspace(1) %out, align 4 @@ -1683,7 +1683,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1695,7 +1695,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1715,7 +1715,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1727,7 +1727,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1737,7 +1737,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1752,8 +1752,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1768,8 +1768,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1782,21 +1782,21 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1811,8 +1811,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1825,15 +1825,15 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42) store i64 %writelane, ptr addrspace(1) %out, align 4 @@ -1843,8 +1843,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1859,8 +1859,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1873,21 +1873,21 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s7, s0 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s6, s0 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1902,8 +1902,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1916,15 +1916,15 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s6, s0 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s7, s0 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0) store double %writelane, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 42a59ec7bccb45..6b57f20f25e2ce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) define amdgpu_kernel void @ceil_f16( ; SI-LABEL: ceil_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @ceil_f16( ; ; VI-LABEL: ceil_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-LABEL: ceil_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -66,7 +66,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-FAKE16-LABEL: ceil_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -100,7 +100,7 @@ entry: define amdgpu_kernel void @ceil_v2f16( ; SI-LABEL: ceil_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -126,7 +126,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; VI-LABEL: ceil_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -146,7 +146,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; GFX11-LABEL: ceil_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -171,7 +171,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; GFX11-FAKE16-LABEL: ceil_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 62bdef89b5a0bc..3ff759a5cdb94f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,31 +46,31 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX9-NEXT: v_cos_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cos_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX10-NEXT: v_cos_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cos_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -89,7 +89,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -140,39 +140,39 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_cos_f16_e32 v2, v3 ; GFX9-NEXT: v_cos_f16_e32 v1, v1 ; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cos_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_cos_f16_e32 v2, v3 ; GFX10-NEXT: v_cos_f16_e32 v1, v1 ; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: cos_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll index 6c8fccd54b81bd..235d8dde966586 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test_debug_value: ; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42 -; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; NOOPT-NEXT: .Ltmp ; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 4bed23487445a6..bbade6e7469f7e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -12,12 +12,12 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 @@ -30,14 +30,14 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -47,13 +47,13 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 @@ -65,14 +65,14 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -82,16 +82,16 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -99,36 +99,36 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -136,17 +136,17 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 @@ -154,39 +154,39 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v1 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -338,7 +338,7 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -390,7 +390,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -442,59 +442,59 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -504,23 +504,23 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -562,7 +562,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -853,51 +853,51 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s3, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 +; VI-SDAG-NEXT: s_and_b32 s3, s1, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, s1, v7 ; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7 ; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, s3, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 ; VI-SDAG-NEXT: v_exp_f32_e32 v7, v2 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v6 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 -; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v5 +; VI-SDAG-NEXT: s_and_b32 s2, s0, 0xfffff000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, s0, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7 @@ -909,308 +909,306 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v6 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v5 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: s_and_b32 s3, s0, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s3, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s1, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 +; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, s1, v5 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8a000, v5 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v6, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, s3, v2 +; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5 ; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v6, v5 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3 -; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42b17218 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; VI-GISEL-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6 -; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, s2, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6 ; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42b17218 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v2, v6 ; VI-GISEL-NEXT: v_exp_f32_e32 v6, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; VI-GISEL-NEXT: v_ldexp_f32 v2, v6, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s1, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v6, s5, v0, -v6 -; GFX900-SDAG-NEXT: v_fma_f32 v6, s5, v1, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v6, s1, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v6, s1, v1, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s6, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s6, v1, v4 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s2, v1, v4 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_ldexp_f32 v6, v6, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s0, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v9, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v7 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v0, -v7 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v10, v7, v9 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v1, v0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v9 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v5 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v7 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s1, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v1, -v5 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v7, v5 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v2, v6 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v2, v6 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v5, v5, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v5 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s4, v1, -v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s0, v1, -v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s4, v2, v3 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s0, v2, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX900-GISEL-NEXT: v_ldexp_f32 v5, v5, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s6, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s6, v1, -v6 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s6, v2, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s2, v1, -v6 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s2, v2, v1 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v6 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v6, v1 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v6, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v6, v2 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 -; SI-SDAG-NEXT: v_fma_f32 v5, s4, v0, -v5 -; SI-SDAG-NEXT: v_fma_f32 v5, s4, v2, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v0 +; SI-SDAG-NEXT: v_fma_f32 v5, s0, v0, -v5 +; SI-SDAG-NEXT: v_fma_f32 v5, s0, v2, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v1 +; SI-SDAG-NEXT: v_fma_f32 v4, s1, v0, -v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v5, v5 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; SI-SDAG-NEXT: v_fma_f32 v4, s5, v2, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, s1, v2, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, v5, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v8, v6 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 +; SI-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 ; SI-SDAG-NEXT: v_sub_f32_e32 v9, v6, v8 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v2, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s2, v2, v0 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x42b17218 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v8 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v6 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v4 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s1, v1 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v1, -v5 ; SI-GISEL-NEXT: v_rndne_f32_e32 v7, v5 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v2, v6 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v2, v6 ; SI-GISEL-NEXT: v_sub_f32_e32 v5, v5, v7 ; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v1 -; SI-GISEL-NEXT: v_fma_f32 v3, s4, v1, -v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v1 +; SI-GISEL-NEXT: v_fma_f32 v3, s0, v1, -v0 ; SI-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; SI-GISEL-NEXT: v_fma_f32 v3, s4, v2, v3 +; SI-GISEL-NEXT: v_fma_f32 v3, s0, v2, v3 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v5, v5, v6 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, s6, v1, -v6 -; SI-GISEL-NEXT: v_fma_f32 v1, s6, v2, v1 +; SI-GISEL-NEXT: v_fma_f32 v1, s2, v1, -v6 +; SI-GISEL-NEXT: v_fma_f32 v1, s2, v2, v1 ; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v6 ; SI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; SI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v6, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v3 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v3 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v3 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, v6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp_v3f32: @@ -1594,70 +1592,70 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 +; VI-SDAG-NEXT: s_and_b32 s6, s3, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s6 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 +; VI-SDAG-NEXT: s_and_b32 s6, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, s2, v7 ; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, s6, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 -; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 +; VI-SDAG-NEXT: s_and_b32 s3, s1, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v9, s3 ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6 -; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9 +; VI-SDAG-NEXT: v_sub_f32_e32 v9, s1, v9 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9 ; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10 -; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v10, s3, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 ; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9 ; VI-SDAG-NEXT: v_exp_f32_e32 v9, v2 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6 -; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; VI-SDAG-NEXT: s_and_b32 s2, s0, 0xfffff000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7 ; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9 +; VI-SDAG-NEXT: v_sub_f32_e32 v9, s0, v9 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9 @@ -1669,377 +1667,376 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v5 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v6 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: s_and_b32 s6, s0, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s6, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s6, s1, 0xfffff000 +; VI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, s1, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, s6, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1 ; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; VI-GISEL-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 -; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 +; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, s2, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6 ; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8 ; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9 ; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6 -; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3 +; VI-GISEL-NEXT: s_and_b32 s0, s3, 0xfffff000 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3 ; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8 -; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8 +; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v8, s3, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8 ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v8 ; VI-GISEL-NEXT: v_exp_f32_e32 v8, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_ldexp_f32 v3, v8, v3 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v3, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v3, s6, v0, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v3, s6, v1, v3 +; GFX900-SDAG-NEXT: v_fma_f32 v3, s2, v0, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v3, s2, v1, v3 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v8, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v8, v3 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v8, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s5, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s1, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v8, v7 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v10, v7, v8 -; GFX900-SDAG-NEXT: v_fma_f32 v7, s5, v0, -v7 -; GFX900-SDAG-NEXT: v_fma_f32 v7, s5, v1, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, s1, v0, -v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, s1, v1, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v7, v10, v7 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v8, v8 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 ; GFX900-SDAG-NEXT: v_ldexp_f32 v7, v7, v8 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v8, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v8, s0, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v10, v8 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v8 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v0, -v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v11, v8, v10 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v11, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v8, v10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v8 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s0, v2, -v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v3, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s0, v3, v1 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s5, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v2, -v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s1, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v2, -v1 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v7, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v3, v6 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v3, v6 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v8, s6, v2, -v6 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v8, s2, v2, -v6 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v9, v6 -; GFX900-GISEL-NEXT: v_fma_f32 v8, s6, v3, v8 +; GFX900-GISEL-NEXT: v_fma_f32 v8, s2, v3, v8 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v6, v6, v9 ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v6, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v6, v6, v8 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, s7, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v2, s7, v2, -v8 -; GFX900-GISEL-NEXT: v_fma_f32 v2, s7, v3, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, s3, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v2, s3, v2, -v8 +; GFX900-GISEL-NEXT: v_fma_f32 v2, s3, v3, v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v8 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v8, v8, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v8, v2 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v8, v2 -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v3, v8, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; SI-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v3, v6 -; SI-SDAG-NEXT: v_fma_f32 v3, s6, v0, -v3 -; SI-SDAG-NEXT: v_fma_f32 v3, s6, v1, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, s2, v0, -v3 +; SI-SDAG-NEXT: v_fma_f32 v3, s2, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v3, v7, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v7, v3 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v5 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v7, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s1, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v7, v6 ; SI-SDAG-NEXT: v_sub_f32_e32 v9, v6, v7 -; SI-SDAG-NEXT: v_fma_f32 v6, s5, v0, -v6 -; SI-SDAG-NEXT: v_fma_f32 v6, s5, v1, v6 +; SI-SDAG-NEXT: v_fma_f32 v6, s1, v0, -v6 +; SI-SDAG-NEXT: v_fma_f32 v6, s1, v1, v6 ; SI-SDAG-NEXT: v_add_f32_e32 v6, v9, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v4 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v5 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, v6, v7 -; SI-SDAG-NEXT: v_mul_f32_e32 v7, s4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v7, s0, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v9, v7 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v7 +; SI-SDAG-NEXT: v_fma_f32 v0, s0, v0, -v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v10, v7, v9 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s0, v1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v10, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v4 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v5 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v7 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v4 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 -; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v2 +; SI-GISEL-NEXT: v_fma_f32 v1, s0, v2, -v0 ; SI-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, s4, v3, v1 +; SI-GISEL-NEXT: v_fma_f32 v1, s0, v3, v1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v2 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v2, -v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s1, v2 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v2, -v1 ; SI-GISEL-NEXT: v_rndne_f32_e32 v7, v1 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v3, v6 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v3, v6 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7 ; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v6 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 -; SI-GISEL-NEXT: v_fma_f32 v8, s6, v2, -v6 +; SI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; SI-GISEL-NEXT: v_fma_f32 v8, s2, v2, -v6 ; SI-GISEL-NEXT: v_rndne_f32_e32 v9, v6 -; SI-GISEL-NEXT: v_fma_f32 v8, s6, v3, v8 +; SI-GISEL-NEXT: v_fma_f32 v8, s2, v3, v8 ; SI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v9 ; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9 ; SI-GISEL-NEXT: v_exp_f32_e32 v6, v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; SI-GISEL-NEXT: v_ldexp_f32_e32 v6, v6, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, s7, v2 -; SI-GISEL-NEXT: v_fma_f32 v2, s7, v2, -v8 -; SI-GISEL-NEXT: v_fma_f32 v2, s7, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v8, s3, v2 +; SI-GISEL-NEXT: v_fma_f32 v2, s3, v2, -v8 +; SI-GISEL-NEXT: v_fma_f32 v2, s3, v3, v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v8 ; SI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v2, v8, v2 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v8, v2 -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; SI-GISEL-NEXT: v_ldexp_f32_e32 v3, v8, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index ec7e52532cd327..81bb556b8c87bc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -14,12 +14,12 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 @@ -32,14 +32,14 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -49,13 +49,13 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 +; VI-GISEL-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 @@ -67,14 +67,14 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -84,16 +84,16 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -101,36 +101,36 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -138,17 +138,17 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 @@ -156,39 +156,39 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v1 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -340,7 +340,7 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -392,7 +392,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -444,59 +444,59 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -506,23 +506,23 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -564,7 +564,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -855,51 +855,51 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s3, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 +; VI-SDAG-NEXT: s_and_b32 s3, s1, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, s1, v7 ; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7 ; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, s3, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 ; VI-SDAG-NEXT: v_exp_f32_e32 v7, v2 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v6 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 -; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v5 +; VI-SDAG-NEXT: s_and_b32 s2, s0, 0xfffff000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, s0, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7 @@ -911,308 +911,306 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v6 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v5 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: s_and_b32 s3, s0, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s3, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s1, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 +; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, s1, v5 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x40549000, v5 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v6, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, s3, v2 +; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5 ; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v6, v5 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3 -; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc23369f4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x421a209b +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v4 +; VI-GISEL-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6 -; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, s2, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6 ; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc23369f4 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x421a209b ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v2, v6 ; VI-GISEL-NEXT: v_exp_f32_e32 v6, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; VI-GISEL-NEXT: v_ldexp_f32 v2, v6, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s1, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v6, s5, v0, -v6 -; GFX900-SDAG-NEXT: v_fma_f32 v6, s5, v1, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v6, s1, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v6, s1, v1, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s6, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s6, v1, v4 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s2, v1, v4 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_ldexp_f32 v6, v6, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s0, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v9, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v7 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v0, -v7 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v10, v7, v9 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v1, v0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v9 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v5 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v7 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s1, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v1, -v5 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v7, v5 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v2, v6 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v2, v6 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v5, v5, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v5 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s4, v1, -v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s0, v1, -v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s4, v2, v3 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s0, v2, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX900-GISEL-NEXT: v_ldexp_f32 v5, v5, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s6, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s6, v1, -v6 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s6, v2, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s2, v1, -v6 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s2, v2, v1 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v6 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v6, v6, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v6, v1 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v6, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v6, v2 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 -; SI-SDAG-NEXT: v_fma_f32 v5, s4, v0, -v5 -; SI-SDAG-NEXT: v_fma_f32 v5, s4, v2, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s5, v0 +; SI-SDAG-NEXT: v_fma_f32 v5, s0, v0, -v5 +; SI-SDAG-NEXT: v_fma_f32 v5, s0, v2, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; SI-SDAG-NEXT: v_fma_f32 v4, s5, v0, -v1 +; SI-SDAG-NEXT: v_fma_f32 v4, s1, v0, -v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v5, v5 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; SI-SDAG-NEXT: v_fma_f32 v4, s5, v2, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, s1, v2, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, v5, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v8, v6 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 +; SI-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 ; SI-SDAG-NEXT: v_sub_f32_e32 v9, v6, v8 -; SI-SDAG-NEXT: v_fma_f32 v0, s6, v2, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s2, v2, v0 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0xc23369f4 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x421a209b ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v8 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v4 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v4 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v6 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v3 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v3 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v4 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s1, v1 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v1, -v5 ; SI-GISEL-NEXT: v_rndne_f32_e32 v7, v5 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v2, v6 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v2, v6 ; SI-GISEL-NEXT: v_sub_f32_e32 v5, v5, v7 ; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v1 -; SI-GISEL-NEXT: v_fma_f32 v3, s4, v1, -v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v1 +; SI-GISEL-NEXT: v_fma_f32 v3, s0, v1, -v0 ; SI-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; SI-GISEL-NEXT: v_fma_f32 v3, s4, v2, v3 +; SI-GISEL-NEXT: v_fma_f32 v3, s0, v2, v3 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v5, v5, v6 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, s6, v1, -v6 -; SI-GISEL-NEXT: v_fma_f32 v1, s6, v2, v1 +; SI-GISEL-NEXT: v_fma_f32 v1, s2, v1, -v6 +; SI-GISEL-NEXT: v_fma_f32 v1, s2, v2, v1 ; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v6 ; SI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; SI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x421a209b ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v6, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v3 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v3 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v3 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, v6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp10_v3f32: @@ -1596,70 +1594,70 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 +; VI-SDAG-NEXT: s_and_b32 s6, s3, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s6 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s6, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 +; VI-SDAG-NEXT: s_and_b32 s6, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, s2, v7 ; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, s6, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 -; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 +; VI-SDAG-NEXT: s_and_b32 s3, s1, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v9, s3 ; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6 -; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9 +; VI-SDAG-NEXT: v_sub_f32_e32 v9, s1, v9 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9 ; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10 -; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v10, s3, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 ; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9 ; VI-SDAG-NEXT: v_exp_f32_e32 v9, v2 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6 -; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; VI-SDAG-NEXT: s_and_b32 s2, s0, 0xfffff000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7 ; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2 -; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9 +; VI-SDAG-NEXT: v_sub_f32_e32 v9, s0, v9 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9 @@ -1671,377 +1669,376 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v5 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v6 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: s_and_b32 s6, s0, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s6, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s6, s1, 0xfffff000 +; VI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, s1, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, s6, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1 ; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 -; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 +; VI-GISEL-NEXT: s_and_b32 s0, s2, 0xfffff000 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 -; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 +; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, s2, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6 ; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8 ; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9 ; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6 -; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3 +; VI-GISEL-NEXT: s_and_b32 s0, s3, 0xfffff000 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3 ; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8 -; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2 -; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8 +; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v8, s3, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x40549000, v8 ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v8 ; VI-GISEL-NEXT: v_exp_f32_e32 v8, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v5 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_ldexp_f32 v3, v8, v3 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v3, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v3, s6, v0, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v3, s6, v1, v3 +; GFX900-SDAG-NEXT: v_fma_f32 v3, s2, v0, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v3, s2, v1, v3 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v8, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v8, v3 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v8, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s5, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s1, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v8, v7 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v10, v7, v8 -; GFX900-SDAG-NEXT: v_fma_f32 v7, s5, v0, -v7 -; GFX900-SDAG-NEXT: v_fma_f32 v7, s5, v1, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, s1, v0, -v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, s1, v1, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v7, v10, v7 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v8, v8 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 ; GFX900-SDAG-NEXT: v_ldexp_f32 v7, v7, v8 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v8, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v8, s0, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v10, v8 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v8 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v0, -v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v11, v8, v10 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s0, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v11, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v8, v10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v6 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v8 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s0, v2, -v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v3, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v1, s0, v3, v1 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s5, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v2, -v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s1, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v2, -v1 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v7, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v3, v6 +; GFX900-GISEL-NEXT: v_fma_f32 v6, s1, v3, v6 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v8, s6, v2, -v6 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v8, s2, v2, -v6 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v9, v6 -; GFX900-GISEL-NEXT: v_fma_f32 v8, s6, v3, v8 +; GFX900-GISEL-NEXT: v_fma_f32 v8, s2, v3, v8 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v6, v6, v9 ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v6, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v6, v6, v8 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, s7, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v2, s7, v2, -v8 -; GFX900-GISEL-NEXT: v_fma_f32 v2, s7, v3, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, s3, v2 +; GFX900-GISEL-NEXT: v_fma_f32 v2, s3, v2, -v8 +; GFX900-GISEL-NEXT: v_fma_f32 v2, s3, v3, v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v8 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v8, v8, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v8, v2 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v8, v2 -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v3, v8, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 +; SI-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; SI-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc23369f4 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v4 +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 ; SI-SDAG-NEXT: v_sub_f32_e32 v7, v3, v6 -; SI-SDAG-NEXT: v_fma_f32 v3, s6, v0, -v3 -; SI-SDAG-NEXT: v_fma_f32 v3, s6, v1, v3 +; SI-SDAG-NEXT: v_fma_f32 v3, s2, v0, -v3 +; SI-SDAG-NEXT: v_fma_f32 v3, s2, v1, v3 ; SI-SDAG-NEXT: v_add_f32_e32 v3, v7, v3 ; SI-SDAG-NEXT: v_exp_f32_e32 v7, v3 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v5 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v7, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s1, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v7, v6 ; SI-SDAG-NEXT: v_sub_f32_e32 v9, v6, v7 -; SI-SDAG-NEXT: v_fma_f32 v6, s5, v0, -v6 -; SI-SDAG-NEXT: v_fma_f32 v6, s5, v1, v6 +; SI-SDAG-NEXT: v_fma_f32 v6, s1, v0, -v6 +; SI-SDAG-NEXT: v_fma_f32 v6, s1, v1, v6 ; SI-SDAG-NEXT: v_add_f32_e32 v6, v9, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v7 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v4 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v5 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v6, v6, v7 -; SI-SDAG-NEXT: v_mul_f32_e32 v7, s4, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v7, s0, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v9, v7 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v7 +; SI-SDAG-NEXT: v_fma_f32 v0, s0, v0, -v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v10, v7, v9 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-SDAG-NEXT: v_fma_f32 v0, s0, v1, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v10, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v4 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s1, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s5, v5 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s1, v5 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v7 -; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v4 +; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v4 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 -; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v2 +; SI-GISEL-NEXT: v_fma_f32 v1, s0, v2, -v0 ; SI-GISEL-NEXT: v_rndne_f32_e32 v4, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, s4, v3, v1 +; SI-GISEL-NEXT: v_fma_f32 v1, s0, v3, v1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v4 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v2 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v2, -v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s1, v2 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v2, -v1 ; SI-GISEL-NEXT: v_rndne_f32_e32 v7, v1 -; SI-GISEL-NEXT: v_fma_f32 v6, s5, v3, v6 +; SI-GISEL-NEXT: v_fma_f32 v6, s1, v3, v6 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7 ; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v5 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v6 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, s6, v2 -; SI-GISEL-NEXT: v_fma_f32 v8, s6, v2, -v6 +; SI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; SI-GISEL-NEXT: v_fma_f32 v8, s2, v2, -v6 ; SI-GISEL-NEXT: v_rndne_f32_e32 v9, v6 -; SI-GISEL-NEXT: v_fma_f32 v8, s6, v3, v8 +; SI-GISEL-NEXT: v_fma_f32 v8, s2, v3, v8 ; SI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v9 ; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9 ; SI-GISEL-NEXT: v_exp_f32_e32 v6, v6 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; SI-GISEL-NEXT: v_ldexp_f32_e32 v6, v6, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, s7, v2 -; SI-GISEL-NEXT: v_fma_f32 v2, s7, v2, -v8 -; SI-GISEL-NEXT: v_fma_f32 v2, s7, v3, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v8, s3, v2 +; SI-GISEL-NEXT: v_fma_f32 v2, s3, v2, -v8 +; SI-GISEL-NEXT: v_fma_f32 v2, s3, v3, v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v8 ; SI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v2, v8, v2 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v8, v2 -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s5, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; SI-GISEL-NEXT: v_ldexp_f32_e32 v3, v8, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp10_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 32b599e63c61d2..30cc060d05bb15 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -12,17 +12,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_exp2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 @@ -31,35 +31,35 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -69,14 +69,14 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc @@ -88,17 +88,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] @@ -106,7 +106,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-GISEL-LABEL: s_exp2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 @@ -173,7 +173,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -199,7 +199,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -223,7 +223,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -247,7 +247,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -271,40 +271,40 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_add_f32_e32 v3, s6, v3 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s7, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v3, s10, v3 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s11, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -312,7 +312,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v2f32: @@ -381,150 +381,150 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s5, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s4, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s0, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v3, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v3, v7 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v6, v5 -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v4, s5, v4 +; SI-GISEL-NEXT: v_add_f32_e32 v4, s1, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] ; SI-GISEL-NEXT: v_exp_f32_e32 v4, v4 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s2, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; SI-GISEL-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v4, s6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v4, s2, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v6, s5, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v6, s1, v6 ; VI-SDAG-NEXT: v_exp_f32_e32 v3, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v4, s1, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] -; VI-GISEL-NEXT: v_add_f32_e32 v4, s5, v4 -; VI-GISEL-NEXT: v_add_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v1, s2, v1 ; VI-GISEL-NEXT: v_exp_f32_e32 v4, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] ; VI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s6, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v6, s5, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s2, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v6, s1, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v1 @@ -532,29 +532,29 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v4, s1, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_add_f32_e32 v4, s5, v4 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc @@ -562,7 +562,7 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v3f32: @@ -656,8 +656,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -693,104 +693,104 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 -; SI-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 +; SI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] ; SI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 -; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v8, v8 ; VI-SDAG-NEXT: v_exp_f32_e32 v9, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s8, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; VI-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 -; VI-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; VI-GISEL-NEXT: v_exp_f32_e32 v3, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc @@ -804,28 +804,28 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v5, s7, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v7, s6, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v9, s5, v9 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, s3, v5 +; GFX900-SDAG-NEXT: v_add_f32_e32 v7, s2, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v9, s1, v9 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v5, v5 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v9, v9 @@ -835,35 +835,35 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 -; GFX900-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc @@ -871,7 +871,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 6036150e189054..4f206b82fdd601 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) define amdgpu_kernel void @floor_f16( ; SI-LABEL: floor_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @floor_f16( ; ; VI-LABEL: floor_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-LABEL: floor_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -66,7 +66,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-FAKE16-LABEL: floor_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -101,7 +101,7 @@ entry: define amdgpu_kernel void @floor_v2f16( ; SI-LABEL: floor_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -127,7 +127,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; VI-LABEL: floor_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -147,7 +147,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-LABEL: floor_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -172,7 +172,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-FAKE16-LABEL: floor_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 10f87e74f39d81..814f44477f5289 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -14,7 +14,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> define amdgpu_kernel void @fmuladd_f16( ; SI-LABEL: fmuladd_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -48,7 +48,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -76,7 +76,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -104,27 +104,27 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 ; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s6, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s7, s3 ; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2 ; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 -; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8 -; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9 -; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12 +; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13 +; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 -; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 -; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s15 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s8 +; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s9 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -134,27 +134,27 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s6, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s7, s3 ; GFX10-DENORM-NEXT: s_mov_b32 s18, s2 ; GFX10-DENORM-NEXT: s_mov_b32 s19, s3 ; GFX10-DENORM-NEXT: s_mov_b32 s22, s2 ; GFX10-DENORM-NEXT: s_mov_b32 s23, s3 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 -; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 -; GFX10-DENORM-NEXT: s_mov_b32 s16, s8 -; GFX10-DENORM-NEXT: s_mov_b32 s17, s9 -; GFX10-DENORM-NEXT: s_mov_b32 s20, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s21, s11 -; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; GFX10-DENORM-NEXT: s_mov_b32 s4, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s5, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s16, s12 +; GFX10-DENORM-NEXT: s_mov_b32 s17, s13 +; GFX10-DENORM-NEXT: s_mov_b32 s20, s14 +; GFX10-DENORM-NEXT: s_mov_b32 s21, s15 +; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0 -; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s8 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s9 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1 ; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[0:3], 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -193,7 +193,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -233,146 +233,146 @@ define amdgpu_kernel void @fmuladd_f16( define amdgpu_kernel void @fmuladd_f16_imm_a( ; SI-LABEL: fmuladd_f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_a: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s10, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s6 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s12, s6 -; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s11 -; VI-FLUSH-NEXT: s_mov_b32 s2, s10 -; VI-FLUSH-NEXT: s_mov_b32 s3, s11 +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 +; VI-FLUSH-NEXT: s_mov_b32 s15, s7 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s8, s4 -; VI-FLUSH-NEXT: s_mov_b32 s9, s5 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; VI-DENORM-LABEL: fmuladd_f16_imm_a: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s10, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s7, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s6, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s6 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_mov_b32 s12, s6 -; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, s11 -; VI-DENORM-NEXT: s_mov_b32 s2, s10 -; VI-DENORM-NEXT: s_mov_b32 s3, s11 +; VI-DENORM-NEXT: s_mov_b32 s12, s2 +; VI-DENORM-NEXT: s_mov_b32 s13, s3 +; VI-DENORM-NEXT: s_mov_b32 s15, s7 +; VI-DENORM-NEXT: s_mov_b32 s10, s6 +; VI-DENORM-NEXT: s_mov_b32 s11, s7 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s4, s0 ; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s8, s4 -; VI-DENORM-NEXT: s_mov_b32 s9, s5 +; VI-DENORM-NEXT: s_mov_b32 s5, s1 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s7 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 -; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, s0 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, s1 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s6, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s7 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 -; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s3 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s4, s0 +; GFX10-DENORM-NEXT: s_mov_b32 s5, s1 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6 -; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 @@ -382,23 +382,23 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX11-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s6, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s7, s11 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s12, s6 -; GFX11-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX11-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-NEXT: s_mov_b32 s13, s3 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX11-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX11-DENORM-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-NEXT: s_mov_b32 s9, s1 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0 ; GFX11-DENORM-NEXT: s_endpgm @@ -415,146 +415,146 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI-LABEL: fmuladd_f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_b: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s10, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s6 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s12, s6 -; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s11 -; VI-FLUSH-NEXT: s_mov_b32 s2, s10 -; VI-FLUSH-NEXT: s_mov_b32 s3, s11 +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 +; VI-FLUSH-NEXT: s_mov_b32 s15, s7 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s8, s4 -; VI-FLUSH-NEXT: s_mov_b32 s9, s5 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; VI-DENORM-LABEL: fmuladd_f16_imm_b: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s10, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s7, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s6, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s6 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_mov_b32 s12, s6 -; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, s11 -; VI-DENORM-NEXT: s_mov_b32 s2, s10 -; VI-DENORM-NEXT: s_mov_b32 s3, s11 +; VI-DENORM-NEXT: s_mov_b32 s12, s2 +; VI-DENORM-NEXT: s_mov_b32 s13, s3 +; VI-DENORM-NEXT: s_mov_b32 s15, s7 +; VI-DENORM-NEXT: s_mov_b32 s10, s6 +; VI-DENORM-NEXT: s_mov_b32 s11, s7 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s4, s0 ; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s8, s4 -; VI-DENORM-NEXT: s_mov_b32 s9, s5 +; VI-DENORM-NEXT: s_mov_b32 s5, s1 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s7 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s7 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 -; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, s0 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, s1 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s6, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s6 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s7 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 -; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX10-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s13, s3 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s4, s0 +; GFX10-DENORM-NEXT: s_mov_b32 s5, s1 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6 -; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 @@ -564,23 +564,23 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX11-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX11-DENORM-NEXT: s_mov_b32 s6, s10 +; GFX11-DENORM-NEXT: s_mov_b32 s7, s11 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s12, s6 -; GFX11-DENORM-NEXT: s_mov_b32 s13, s7 +; GFX11-DENORM-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-NEXT: s_mov_b32 s13, s3 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX11-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX11-DENORM-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-NEXT: s_mov_b32 s9, s1 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0 ; GFX11-DENORM-NEXT: s_endpgm @@ -597,7 +597,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( define amdgpu_kernel void @fmuladd_v2f16( ; SI-LABEL: fmuladd_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -641,7 +641,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; VI-FLUSH-LABEL: fmuladd_v2f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -674,7 +674,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; VI-DENORM-LABEL: fmuladd_v2f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -710,27 +710,27 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-FLUSH-LABEL: fmuladd_v2f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 ; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s6, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s7, s3 ; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2 ; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 -; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8 -; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9 -; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s4, s10 +; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11 +; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12 +; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13 +; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 -; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 -; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0 -; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14 +; GFX10-FLUSH-NEXT: s_mov_b32 s13, s15 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s8 +; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s9 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1) ; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -740,27 +740,27 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-DENORM-LABEL: fmuladd_v2f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s6, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s7, s3 ; GFX10-DENORM-NEXT: s_mov_b32 s18, s2 ; GFX10-DENORM-NEXT: s_mov_b32 s19, s3 ; GFX10-DENORM-NEXT: s_mov_b32 s22, s2 ; GFX10-DENORM-NEXT: s_mov_b32 s23, s3 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 -; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 -; GFX10-DENORM-NEXT: s_mov_b32 s16, s8 -; GFX10-DENORM-NEXT: s_mov_b32 s17, s9 -; GFX10-DENORM-NEXT: s_mov_b32 s20, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s21, s11 -; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX10-DENORM-NEXT: s_mov_b32 s4, s10 +; GFX10-DENORM-NEXT: s_mov_b32 s5, s11 +; GFX10-DENORM-NEXT: s_mov_b32 s16, s12 +; GFX10-DENORM-NEXT: s_mov_b32 s17, s13 +; GFX10-DENORM-NEXT: s_mov_b32 s20, s14 +; GFX10-DENORM-NEXT: s_mov_b32 s21, s15 +; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0 -; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s8 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s9 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -768,7 +768,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-FLUSH-LABEL: fmuladd_v2f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -799,7 +799,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-DENORM-LABEL: fmuladd_v2f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll index ccd30d3d8bea68..9187d8fd4f9fef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll @@ -83,7 +83,7 @@ define i32 @strictfp_func_fpmode_i32() strictfp { define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; GFX6-LABEL: kernel_fpmode_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX6-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -95,7 +95,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX7-LABEL: kernel_fpmode_i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX7-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX8-LABEL: kernel_fpmode_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 19) ; GFX8-NEXT: s_and_b32 s2, 0x7f3ff, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX9-LABEL: kernel_fpmode_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX9-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX10-LABEL: kernel_fpmode_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_and_b32 s2, 0x87f3ff, s2 @@ -141,7 +141,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX11-LABEL: kernel_fpmode_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX11-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 86311ab859258d..f416131e3d3140 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -14,12 +14,12 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX7CHECK-LABEL: sgpr_isnan_bf16: ; GFX7CHECK: ; %bb.0: -; GFX7CHECK-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7CHECK-NEXT: s_mov_b32 s3, 0xf000 ; GFX7CHECK-NEXT: s_mov_b32 s2, -1 ; GFX7CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX7CHECK-NEXT: s_and_b32 s4, s4, 0x7fff +; GFX7CHECK-NEXT: s_and_b32 s4, s6, 0x7fff ; GFX7CHECK-NEXT: s_cmpk_gt_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -28,10 +28,10 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX8CHECK-LABEL: sgpr_isnan_bf16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX8CHECK-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX8CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 ; GFX8CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 @@ -42,11 +42,11 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_bf16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX9CHECK-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX9CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 ; GFX9CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] @@ -56,11 +56,11 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX10CHECK-LABEL: sgpr_isnan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX10CHECK-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX10CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 ; GFX10CHECK-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 @@ -70,11 +70,11 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX11CHECK-LABEL: sgpr_isnan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11CHECK-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 ; GFX11CHECK-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index a577fb3d190ab9..b7c566f682e349 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -13,12 +13,12 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f16: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX7SELDAG-NEXT: s_and_b32 s4, s4, 0x7fff +; GFX7SELDAG-NEXT: s_and_b32 s4, s6, 0x7fff ; GFX7SELDAG-NEXT: s_cmpk_gt_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -27,11 +27,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f16: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s3, s[4:5], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: s_and_b32 s3, s4, 0x7fff +; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff ; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00 ; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0 @@ -43,10 +43,10 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX8CHECK-LABEL: sgpr_isnan_f16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 +; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -55,11 +55,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 +; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] ; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9CHECK-NEXT: s_endpgm @@ -67,11 +67,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX10CHECK-LABEL: sgpr_isnan_f16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 +; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -79,11 +79,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 +; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll index 96551d5bf78539..d411601d9eabd2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -13,34 +13,34 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f32: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cmp_class_f32_e64 s[4:5], s4, 3 +; GFX7SELDAG-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 3 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] ; GFX7SELDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7SELDAG-NEXT: s_endpgm ; ; GFX7GLISEL-LABEL: sgpr_isnan_f32: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s3, s[4:5], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 -; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s4, 3 +; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s3, 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7GLISEL-NEXT: s_endpgm ; ; GFX8CHECK-LABEL: sgpr_isnan_f32: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 +; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -49,11 +49,11 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f32: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 +; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] ; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9CHECK-NEXT: s_endpgm @@ -61,11 +61,11 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX10CHECK-LABEL: sgpr_isnan_f32: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -73,11 +73,11 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f32: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 +; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] @@ -91,7 +91,7 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f64: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s6, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -104,7 +104,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f64: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX7GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] @@ -115,7 +115,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8SELDAG-LABEL: sgpr_isnan_f64: ; GFX8SELDAG: ; %bb.0: -; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -126,7 +126,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8GLISEL-LABEL: sgpr_isnan_f64: ; GFX8GLISEL: ; %bb.0: -; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -137,17 +137,17 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f64: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_f64: ; GFX10CHECK: ; %bb.0: -; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 @@ -157,7 +157,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX11CHECK-LABEL: sgpr_isnan_f64: ; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 8f28208945fbcd..279ffeab51fb3c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -14,8 +14,8 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -42,8 +42,8 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf @@ -70,10 +70,10 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc @@ -101,10 +101,10 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc @@ -132,8 +132,8 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf @@ -154,13 +154,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf @@ -181,18 +181,18 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,7 +204,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -212,13 +212,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -228,7 +228,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -312,7 +312,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -392,7 +392,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -439,7 +439,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -486,27 +486,27 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 ; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s11, v3 ; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000 +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s2, -v4 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s3, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s4 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc @@ -515,24 +515,24 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v3 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s10, v2 ; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2 @@ -541,9 +541,9 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc @@ -557,12 +557,12 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -595,7 +595,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -739,8 +739,8 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -792,8 +792,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -845,16 +845,16 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 +; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 @@ -864,29 +864,29 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s8 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s5, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s9, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 ; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v6, v3, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s8 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 @@ -899,26 +899,25 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s8 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s7 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v3 @@ -932,20 +931,20 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s5, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s9, v3 ; VI-GISEL-NEXT: v_log_f32_e32 v3, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6 ; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7 ; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 @@ -969,73 +968,73 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s8 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s9 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3377d1cf -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s6, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, v5 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v4 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s9, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s6, -v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s4, -v6 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s7, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s5, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v6, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s6, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s7, v6 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s4, -v4 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s5, v6 ; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 @@ -1045,17 +1044,17 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s5, v6 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6 ; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v6 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8 ; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9 @@ -1073,30 +1072,29 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_v3f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s4, v2 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 @@ -1112,7 +1110,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1127,29 +1125,29 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log_v3f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 @@ -1166,7 +1164,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1181,6 +1179,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -1341,8 +1340,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf @@ -1405,8 +1404,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1469,16 +1468,16 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_mov_b32 s7, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 @@ -1488,11 +1487,11 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v4, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc @@ -1505,29 +1504,29 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6 ; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, v6, v4 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317000, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 @@ -1540,25 +1539,25 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, s9 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v1 @@ -1572,9 +1571,9 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc @@ -1587,22 +1586,22 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s6, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, s10, v7 ; VI-GISEL-NEXT: v_log_f32_e32 v7, v7 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v5, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 ; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; VI-GISEL-NEXT: v_sub_f32_e32 v8, v7, v6 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v10, 0x3805fdf4, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v9, v10, v9 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317000, v8 ; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 @@ -1626,85 +1625,85 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s8 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s9 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf ; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x41b17218 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s10, v5 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 ; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s7, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s10, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s4, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s5, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s5, v7 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s9, v7 ; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s7, -v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s4, -v5 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s10, v8 +; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s5, v8 ; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s7, -v5 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s10, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v5 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3377d1cf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -1714,9 +1713,9 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc @@ -1724,19 +1723,19 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s6, v9 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9 ; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v9 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v9 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8 ; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10 @@ -1754,38 +1753,37 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s6 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s1, v2 :: v_dual_mul_f32 v3, s0, v3 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 @@ -1813,38 +1811,38 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log_v4f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 @@ -1873,6 +1871,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index e80d8e3bfb386e..df880164b196b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -14,8 +14,8 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -42,8 +42,8 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf @@ -70,10 +70,10 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc @@ -101,10 +101,10 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc @@ -132,8 +132,8 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf @@ -154,13 +154,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf @@ -181,18 +181,18 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,7 +204,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -212,13 +212,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log10_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -228,7 +228,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -312,7 +312,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -392,7 +392,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -439,7 +439,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -486,27 +486,27 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a ; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s7, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s11, v3 ; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x7f800000 +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s2, -v4 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s3, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s4 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc @@ -515,24 +515,24 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v3 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s10, v2 ; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2 @@ -541,9 +541,9 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc @@ -557,12 +557,12 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -595,7 +595,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -739,8 +739,8 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -792,8 +792,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -845,16 +845,16 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 +; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 @@ -864,29 +864,29 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s8 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s5, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s9, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 ; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v6, v3, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s8 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 @@ -899,26 +899,25 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s8 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s6 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s7 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v3 @@ -932,20 +931,20 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s5, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s9, v3 ; VI-GISEL-NEXT: v_log_f32_e32 v3, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6 ; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7 ; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 @@ -969,73 +968,73 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s8 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s9 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3284fbcf -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s6, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, v5 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v4 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s9, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s6, -v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s4, -v6 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s7, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s5, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v6, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s6, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s7, v6 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s4, -v4 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v0, s5, v6 ; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s10 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 @@ -1045,17 +1044,17 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s5, v6 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6 ; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v6 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8 ; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9 @@ -1073,30 +1072,29 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_v3f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s4, v2 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 @@ -1112,7 +1110,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1127,29 +1125,29 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log10_v3f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 @@ -1166,7 +1164,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1181,6 +1179,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -1341,8 +1340,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf @@ -1405,8 +1404,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1469,16 +1468,16 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_mov_b32 s7, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 @@ -1488,11 +1487,11 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v4, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc @@ -1505,29 +1504,29 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6 ; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, v6, v4 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] ; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 @@ -1540,25 +1539,25 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s7 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s8 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, s9 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v1 @@ -1572,9 +1571,9 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc @@ -1587,22 +1586,22 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7 ; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s6, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, s10, v7 ; VI-GISEL-NEXT: v_log_f32_e32 v7, v7 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] ; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v5, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 ; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; VI-GISEL-NEXT: v_sub_f32_e32 v8, v7, v6 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] ; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v10, 0x369a84fb, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v9, v10, v9 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v8 ; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 @@ -1626,85 +1625,85 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s8 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s9 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s7, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf ; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x411a209b ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s7, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s10, v5 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 ; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s7, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s10, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s4, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s5, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s5, v7 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s9, v7 ; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s7, -v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s4, -v5 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s10, v8 +; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s5, v8 ; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s7, -v5 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s10, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v5 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7 ; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3284fbcf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -1714,9 +1713,9 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc @@ -1724,19 +1723,19 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s6, v9 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9 ; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v9 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v9 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8 ; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10 @@ -1754,38 +1753,37 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s6 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s1, v2 :: v_dual_mul_f32 v3, s0, v3 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 @@ -1813,38 +1811,38 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log10_v4f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 @@ -1873,6 +1871,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 6578311178ab75..c5dea7fd8b4b1f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -33,35 +33,35 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -71,14 +71,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -90,17 +90,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] @@ -108,7 +108,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-GISEL-LABEL: s_log2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -116,7 +116,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -127,15 +127,15 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-SDAG-LABEL: s_log2_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -146,14 +146,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log2_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2 ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -238,7 +238,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -262,7 +262,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -286,7 +286,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -310,40 +310,40 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, s6, v3 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, s10, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v3 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -351,12 +351,12 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v3, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 @@ -379,7 +379,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -465,150 +465,150 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s1, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s4, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s0, v6 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v5 -; SI-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] ; SI-GISEL-NEXT: v_log_f32_e32 v4, v4 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; SI-GISEL-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s1, v6 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v4, v4 ; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_log2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s6, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s1, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v1 @@ -616,29 +616,29 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v1 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v4 ; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc @@ -646,68 +646,68 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v3f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s6 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s1, v4 :: v_dual_mul_f32 v5, s0, v5 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 +; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[4:5] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log2_v3f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -802,8 +802,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -839,104 +839,104 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc ; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 ; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v8, v8 ; VI-SDAG-NEXT: v_log_f32_e32 v9, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; VI-GISEL-NEXT: v_log_f32_e32 v5, v5 ; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -950,28 +950,28 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, s7, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s6, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v9, s5, v9 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, s3, v5 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s2, v7 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v9, s1, v9 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v5 ; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 ; GFX900-SDAG-NEXT: v_log_f32_e32 v9, v9 @@ -981,35 +981,35 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s5, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5 ; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -1017,77 +1017,79 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s3, v2 :: v_dual_mul_f32 v3, s2, v3 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s1, v6 :: v_dual_mul_f32 v7, s0, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[4:5] ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log2_v4f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index bc0daf95e329c0..d90c4a75ac5dea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -425,8 +425,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX7-LABEL: s_maximum_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s16 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -442,10 +442,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_maximum_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_max_f16_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_max_f16_e32 v1, s16, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -456,10 +456,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_maximum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_max_f16_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_max_f16_e32 v1, s16, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ -485,8 +485,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX10-LABEL: s_maximum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_max_f16_e64 v0, s16, s17 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -872,10 +872,10 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s19 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s17 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s18 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s16 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -899,16 +899,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s7, 16 -; GFX8-NEXT: s_lshr_b32 s5, s6, 16 +; GFX8-NEXT: s_lshr_b32 s4, s17, 16 +; GFX8-NEXT: s_lshr_b32 s5, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_max_f16_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_max_f16_e32 v3, s6, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_max_f16_e32 v3, s16, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -920,13 +920,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s4, s7, 16 -; GFX9-NEXT: v_pk_max_f16 v1, s6, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_lshr_b32 s4, s17, 16 +; GFX9-NEXT: v_pk_max_f16 v1, s16, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX9-NEXT: s_lshr_b32 s5, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -965,10 +965,10 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 -; GFX10-NEXT: s_lshr_b32 s4, s7, 16 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: v_pk_max_f16 v0, s16, s17 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 +; GFX10-NEXT: s_lshr_b32 s4, s17, 16 +; GFX10-NEXT: s_lshr_b32 s5, s16, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 6b61931fc9414b..48851cb030233d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -401,10 +401,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_maximum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s17 +; GFX7-NEXT: v_max_f32_e32 v1, s16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_maximum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_max_f32_e32 v1, s16, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_maximum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_max_f32_e32 v1, s16, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_maximum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_max_f32_e64 v0, s16, s17 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s17 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -782,14 +782,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s17 -; GFX7-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s19 +; GFX7-NEXT: v_max_f32_e32 v1, s17, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_max_f32_e32 v3, s16, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -799,14 +799,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s17 -; GFX8-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s19 +; GFX8-NEXT: v_max_f32_e32 v1, s17, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_max_f32_e32 v3, s16, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -816,14 +816,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s19 +; GFX9-NEXT: v_max_f32_e32 v1, s17, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_max_f32_e32 v3, s16, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -851,11 +851,11 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s7, s17 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 -; GFX10-NEXT: v_max_f32_e64 v2, s6, s16 +; GFX10-NEXT: v_max_f32_e64 v0, s17, s19 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19 +; GFX10-NEXT: v_max_f32_e64 v2, s16, s18 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index 9a83c04cad1e3e..80a0a194713d90 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -427,10 +427,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_maximum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -442,10 +442,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_maximum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_maximum_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_maximum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[16:17] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] +; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[18:19] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_maximum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX7-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], s[16:17], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_maximum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s21 -; GFX8-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -886,14 +886,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9-LABEL: s_maximum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_maximum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[18:19] ; GFX940-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GFX940-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,10 +927,10 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_maximum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[20:21] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21] -; GFX10-NEXT: v_max_f64 v[4:5], s[6:7], s[18:19] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[6:7], s[18:19] +; GFX10-NEXT: v_max_f64 v[0:1], s[18:19], s[22:23] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23] +; GFX10-NEXT: v_max_f64 v[4:5], s[16:17], s[20:21] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21] ; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 @@ -943,10 +943,10 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX11-LABEL: s_maximum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[16:17] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] -; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[18:19] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19] +; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[16:17] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[16:17] -; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[6:7] +; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[18:19] +; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[16:17] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 1daa45285e68aa..0517e41e3d651b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -13,126 +13,126 @@ declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @maxnum_f16( ; SI-LABEL: maxnum_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s6 +; GFX10-NEXT: s_mov_b32 s15, s7 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s12, s6 -; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: s_mov_b32 s12, s2 +; GFX10-NEXT: s_mov_b32 s13, s3 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -153,7 +153,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-LABEL: maxnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -173,7 +173,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; VI-LABEL: maxnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -192,45 +192,45 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX9-LABEL: maxnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -259,7 +259,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-LABEL: maxnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -279,7 +279,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; VI-LABEL: maxnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -298,45 +298,45 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX9-LABEL: maxnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -365,37 +365,37 @@ entry: define amdgpu_kernel void @maxnum_v2f16( ; SI-LABEL: maxnum_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[6:7], 0x0 -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: s_lshr_b32 s5, s2, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v1, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s8, s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 @@ -414,46 +414,46 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -478,7 +478,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_a( ; SI-LABEL: maxnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -498,7 +498,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; VI-LABEL: maxnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -516,34 +516,34 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x44004200 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x44004200 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -566,7 +566,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_b( ; SI-LABEL: maxnum_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -586,7 +586,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; VI-LABEL: maxnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -604,34 +604,34 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x42004400 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x42004400 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -655,22 +655,22 @@ entry: define amdgpu_kernel void @maxnum_v3f16( ; SI-LABEL: maxnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: s_lshr_b32 s8, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: s_lshr_b32 s7, s6, 16 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 ; SI-NEXT: v_max_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v2, v3, v4 @@ -679,18 +679,18 @@ define amdgpu_kernel void @maxnum_v3f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 @@ -713,54 +713,54 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 -; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 @@ -790,28 +790,28 @@ entry: define amdgpu_kernel void @maxnum_v4f16( ; SI-LABEL: maxnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: s_lshr_b32 s2, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 ; SI-NEXT: v_max_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -824,17 +824,17 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 @@ -862,52 +862,52 @@ define amdgpu_kernel void @maxnum_v4f16( ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s5, s5 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v2, s4, s4 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 @@ -935,7 +935,7 @@ entry: define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-LABEL: fmax_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -964,7 +964,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; VI-LABEL: fmax_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -991,41 +991,41 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX9-LABEL: fmax_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 ; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fmax_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 -; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0 ; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fmax_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 77b5682a2dbd17..a74043378a2598 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -351,10 +351,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_minimum_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_min_f16_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_min_f16_e32 v1, s16, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -365,10 +365,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_minimum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_min_f16_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_min_f16_e32 v1, s16, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ -394,8 +394,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX10-LABEL: s_minimum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f16_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_min_f16_e64 v0, s16, s17 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -711,16 +711,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s7, 16 -; GFX8-NEXT: s_lshr_b32 s5, s6, 16 +; GFX8-NEXT: s_lshr_b32 s4, s17, 16 +; GFX8-NEXT: s_lshr_b32 s5, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_min_f16_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_min_f16_e32 v3, s6, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_min_f16_e32 v3, s16, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -732,13 +732,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s4, s7, 16 -; GFX9-NEXT: v_pk_min_f16 v1, s6, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_lshr_b32 s4, s17, 16 +; GFX9-NEXT: v_pk_min_f16 v1, s16, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX9-NEXT: s_lshr_b32 s5, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -777,10 +777,10 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 -; GFX10-NEXT: s_lshr_b32 s4, s7, 16 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: v_pk_min_f16 v0, s16, s17 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 +; GFX10-NEXT: s_lshr_b32 s4, s17, 16 +; GFX10-NEXT: s_lshr_b32 s5, s16, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 8753dc50c4da40..2b3041290b5866 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -401,10 +401,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_minimum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s17 +; GFX7-NEXT: v_min_f32_e32 v1, s16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_minimum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s17 +; GFX8-NEXT: v_min_f32_e32 v1, s16, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_minimum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s17 +; GFX9-NEXT: v_min_f32_e32 v1, s16, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_minimum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_min_f32_e64 v0, s16, s17 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s17 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -782,14 +782,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: s_minimum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s17 -; GFX7-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s19 +; GFX7-NEXT: v_min_f32_e32 v1, s17, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_min_f32_e32 v3, s16, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -799,14 +799,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s17 -; GFX8-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s19 +; GFX8-NEXT: v_min_f32_e32 v1, s17, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_min_f32_e32 v3, s16, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -816,14 +816,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s19 +; GFX9-NEXT: v_min_f32_e32 v1, s17, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s18 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_min_f32_e32 v3, s16, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -851,11 +851,11 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s7, s17 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 -; GFX10-NEXT: v_min_f32_e64 v2, s6, s16 +; GFX10-NEXT: v_min_f32_e64 v0, s17, s19 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19 +; GFX10-NEXT: v_min_f32_e64 v2, s16, s18 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 81b892d424b46a..567582c9f58ff2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -427,10 +427,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_minimum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -442,10 +442,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_minimum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_minimum_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_minimum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[16:17] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] +; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[18:19] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_minimum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX7-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], s[16:17], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_minimum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s21 -; GFX8-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], s[16:17], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -886,14 +886,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9-LABEL: s_minimum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], s[16:17], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_minimum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[18:19] ; GFX940-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] ; GFX940-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,10 +927,10 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_minimum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[20:21] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21] -; GFX10-NEXT: v_min_f64 v[4:5], s[6:7], s[18:19] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[6:7], s[18:19] +; GFX10-NEXT: v_min_f64 v[0:1], s[18:19], s[22:23] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23] +; GFX10-NEXT: v_min_f64 v[4:5], s[16:17], s[20:21] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21] ; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 @@ -943,10 +943,10 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX11-LABEL: s_minimum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[16:17] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] -; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[18:19] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19] +; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[16:17] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[16:17] -; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[6:7] +; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[18:19] +; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[16:17] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 00cba8c77fc78f..8a2c6e2ad97e94 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -13,126 +13,126 @@ declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @minnum_f16_ieee( ; SI-LABEL: minnum_f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s6 +; GFX10-NEXT: s_mov_b32 s15, s7 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s12, s6 -; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: s_mov_b32 s12, s2 +; GFX10-NEXT: s_mov_b32 s13, s3 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 @@ -180,7 +180,7 @@ define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { define amdgpu_kernel void @minnum_f16_imm_a( ; SI-LABEL: minnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -200,7 +200,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; VI-LABEL: minnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -219,45 +219,45 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX9-LABEL: minnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -285,7 +285,7 @@ entry: define amdgpu_kernel void @minnum_f16_imm_b( ; SI-LABEL: minnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -305,7 +305,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; VI-LABEL: minnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -324,45 +324,45 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX9-LABEL: minnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0 -; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -390,37 +390,37 @@ entry: define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-LABEL: minnum_v2f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[6:7], 0x0 -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s2, s[2:3], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: s_lshr_b32 s5, s2, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_min_f32_e32 v1, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v2f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s8, s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 @@ -439,46 +439,46 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -537,7 +537,7 @@ define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) define amdgpu_kernel void @minnum_v2f16_imm_a( ; SI-LABEL: minnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; VI-LABEL: minnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -575,34 +575,34 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x44004200 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x44004200 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -624,7 +624,7 @@ entry: define amdgpu_kernel void @minnum_v2f16_imm_b( ; SI-LABEL: minnum_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -644,7 +644,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; VI-LABEL: minnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -662,34 +662,34 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s0, s0 -; GFX9-NEXT: s_mov_b32 s0, 0x42004400 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x42004400 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -712,22 +712,22 @@ entry: define amdgpu_kernel void @minnum_v3f16( ; SI-LABEL: minnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: s_lshr_b32 s8, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: s_lshr_b32 s7, s6, 16 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 ; SI-NEXT: v_min_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v2, v3, v4 @@ -736,18 +736,18 @@ define amdgpu_kernel void @minnum_v3f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 @@ -770,54 +770,54 @@ define amdgpu_kernel void @minnum_v3f16( ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 -; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 @@ -846,28 +846,28 @@ entry: define amdgpu_kernel void @minnum_v4f16( ; SI-LABEL: minnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: s_lshr_b32 s2, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 ; SI-NEXT: v_min_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -880,17 +880,17 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 @@ -918,52 +918,52 @@ define amdgpu_kernel void @minnum_v4f16( ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s5, s5 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v2, s4, s4 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 @@ -990,7 +990,7 @@ entry: define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-LABEL: fmin_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1019,7 +1019,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; VI-LABEL: fmin_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1046,41 +1046,41 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX9-LABEL: fmin_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 -; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX9-NEXT: v_pk_max_f16 v2, s2, s2 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fmin_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 -; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0 ; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fmin_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 7e3158bd1106ea..d46622ef45f435 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -332,7 +332,7 @@ bb: define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: umulo_i64_s: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -365,57 +365,57 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: umulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s4, s7 -; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 -; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7 -; GFX9-NEXT: s_add_u32 s9, s8, s3 -; GFX9-NEXT: s_mul_i32 s2, s5, s6 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6 -; GFX9-NEXT: s_add_u32 s9, s9, s2 -; GFX9-NEXT: s_mul_hi_u32 s10, s5, s7 -; GFX9-NEXT: s_addc_u32 s0, s1, s0 -; GFX9-NEXT: s_addc_u32 s1, s10, 0 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_add_u32 s0, s0, s5 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_add_i32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_i32 s2, s4, s6 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, 0, s3 -; GFX9-NEXT: s_cselect_b32 s1, 0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mul_i32 s7, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3 +; GFX9-NEXT: s_add_u32 s9, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s1, s2 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2 +; GFX9-NEXT: s_add_u32 s9, s9, s6 +; GFX9-NEXT: s_mul_hi_u32 s10, s1, s3 +; GFX9-NEXT: s_addc_u32 s4, s5, s4 +; GFX9-NEXT: s_addc_u32 s5, s10, 0 +; GFX9-NEXT: s_mul_i32 s1, s1, s3 +; GFX9-NEXT: s_add_u32 s4, s4, s1 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_add_i32 s1, s8, s7 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_cselect_b32 s1, 0, s1 +; GFX9-NEXT: s_cselect_b32 s0, 0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: umulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s3, s4, s7 -; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 -; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7 -; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6 -; GFX10-NEXT: s_mul_i32 s2, s5, s6 -; GFX10-NEXT: s_mul_hi_u32 s9, s5, s7 -; GFX10-NEXT: s_mul_i32 s5, s5, s7 -; GFX10-NEXT: s_add_u32 s7, s8, s3 -; GFX10-NEXT: s_addc_u32 s1, 0, s1 -; GFX10-NEXT: s_add_u32 s7, s7, s2 -; GFX10-NEXT: s_addc_u32 s0, s1, s0 -; GFX10-NEXT: s_addc_u32 s1, s9, 0 -; GFX10-NEXT: s_add_u32 s0, s0, s5 -; GFX10-NEXT: s_addc_u32 s1, 0, s1 -; GFX10-NEXT: s_add_i32 s3, s8, s3 -; GFX10-NEXT: s_mul_i32 s4, s4, s6 -; GFX10-NEXT: s_add_i32 s3, s3, s2 -; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX10-NEXT: s_cselect_b32 s0, 0, s4 -; GFX10-NEXT: s_cselect_b32 s1, 0, s3 +; GFX10-NEXT: s_mul_i32 s7, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 +; GFX10-NEXT: s_mul_i32 s6, s1, s2 +; GFX10-NEXT: s_mul_hi_u32 s9, s1, s3 +; GFX10-NEXT: s_mul_i32 s1, s1, s3 +; GFX10-NEXT: s_add_u32 s3, s8, s7 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 +; GFX10-NEXT: s_add_u32 s3, s3, s6 +; GFX10-NEXT: s_addc_u32 s3, s5, s4 +; GFX10-NEXT: s_addc_u32 s5, s9, 0 +; GFX10-NEXT: s_add_u32 s4, s3, s1 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 +; GFX10-NEXT: s_add_i32 s1, s8, s7 +; GFX10-NEXT: s_mul_i32 s0, s0, s2 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX10-NEXT: s_cselect_b32 s0, 0, s0 +; GFX10-NEXT: s_cselect_b32 s1, 0, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off @@ -423,7 +423,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: umulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -452,7 +452,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: umulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 @@ -487,7 +487,7 @@ bb: define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: smulo_i64_s: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -536,81 +536,81 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: smulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s4, s7 -; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 -; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7 -; GFX9-NEXT: s_add_u32 s9, s8, s3 -; GFX9-NEXT: s_mul_i32 s2, s5, s6 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6 -; GFX9-NEXT: s_add_u32 s9, s9, s2 -; GFX9-NEXT: s_mul_hi_i32 s10, s5, s7 -; GFX9-NEXT: s_addc_u32 s0, s1, s0 -; GFX9-NEXT: s_addc_u32 s1, s10, 0 -; GFX9-NEXT: s_mul_i32 s9, s5, s7 -; GFX9-NEXT: s_add_u32 s0, s0, s9 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_sub_u32 s9, s0, s6 -; GFX9-NEXT: s_subb_u32 s10, s1, 0 -; GFX9-NEXT: s_cmp_lt_i32 s5, 0 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s1, s10, s1 -; GFX9-NEXT: s_sub_u32 s5, s0, s4 -; GFX9-NEXT: s_subb_u32 s9, s1, 0 -; GFX9-NEXT: s_cmp_lt_i32 s7, 0 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_add_i32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s5, s3, s2 -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], s[2:3] -; GFX9-NEXT: s_cselect_b32 s0, 0, s5 -; GFX9-NEXT: s_cselect_b32 s1, 0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mul_i32 s7, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3 +; GFX9-NEXT: s_add_u32 s9, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s1, s2 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2 +; GFX9-NEXT: s_add_u32 s9, s9, s6 +; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3 +; GFX9-NEXT: s_addc_u32 s4, s5, s4 +; GFX9-NEXT: s_addc_u32 s5, s10, 0 +; GFX9-NEXT: s_mul_i32 s9, s1, s3 +; GFX9-NEXT: s_add_u32 s4, s4, s9 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_sub_u32 s9, s4, s2 +; GFX9-NEXT: s_subb_u32 s10, s5, 0 +; GFX9-NEXT: s_cmp_lt_i32 s1, 0 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_cselect_b32 s1, s10, s5 +; GFX9-NEXT: s_sub_u32 s9, s4, s0 +; GFX9-NEXT: s_subb_u32 s5, s1, 0 +; GFX9-NEXT: s_cmp_lt_i32 s3, 0 +; GFX9-NEXT: s_cselect_b32 s5, s5, s1 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_add_i32 s1, s8, s7 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_mov_b32 s7, s6 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] +; GFX9-NEXT: s_cselect_b32 s1, 0, s1 +; GFX9-NEXT: s_cselect_b32 s0, 0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: smulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s3, s4, s7 -; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 -; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7 -; GFX10-NEXT: s_mul_i32 s2, s5, s6 -; GFX10-NEXT: s_add_u32 s11, s8, s3 -; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6 -; GFX10-NEXT: s_addc_u32 s1, 0, s1 -; GFX10-NEXT: s_mul_hi_i32 s9, s5, s7 -; GFX10-NEXT: s_add_u32 s11, s11, s2 -; GFX10-NEXT: s_mul_i32 s10, s5, s7 -; GFX10-NEXT: s_addc_u32 s0, s1, s0 -; GFX10-NEXT: s_addc_u32 s1, s9, 0 -; GFX10-NEXT: s_add_u32 s0, s0, s10 -; GFX10-NEXT: s_addc_u32 s1, 0, s1 -; GFX10-NEXT: s_sub_u32 s9, s0, s6 -; GFX10-NEXT: s_subb_u32 s10, s1, 0 -; GFX10-NEXT: s_cmp_lt_i32 s5, 0 -; GFX10-NEXT: s_cselect_b32 s0, s9, s0 -; GFX10-NEXT: s_cselect_b32 s1, s10, s1 -; GFX10-NEXT: s_sub_u32 s5, s0, s4 -; GFX10-NEXT: s_subb_u32 s9, s1, 0 -; GFX10-NEXT: s_cmp_lt_i32 s7, 0 -; GFX10-NEXT: s_mul_i32 s4, s4, s6 -; GFX10-NEXT: s_cselect_b32 s1, s9, s1 -; GFX10-NEXT: s_cselect_b32 s0, s5, s0 -; GFX10-NEXT: s_add_i32 s3, s8, s3 -; GFX10-NEXT: s_add_i32 s5, s3, s2 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_cmp_lg_u64 s[0:1], s[2:3] -; GFX10-NEXT: s_cselect_b32 s0, 0, s4 -; GFX10-NEXT: s_cselect_b32 s1, 0, s5 +; GFX10-NEXT: s_mul_i32 s7, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 +; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 +; GFX10-NEXT: s_mul_i32 s6, s1, s2 +; GFX10-NEXT: s_add_u32 s11, s8, s7 +; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 +; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3 +; GFX10-NEXT: s_add_u32 s11, s11, s6 +; GFX10-NEXT: s_mul_i32 s10, s1, s3 +; GFX10-NEXT: s_addc_u32 s4, s5, s4 +; GFX10-NEXT: s_addc_u32 s5, s9, 0 +; GFX10-NEXT: s_add_u32 s4, s4, s10 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 +; GFX10-NEXT: s_sub_u32 s9, s4, s2 +; GFX10-NEXT: s_subb_u32 s10, s5, 0 +; GFX10-NEXT: s_cmp_lt_i32 s1, 0 +; GFX10-NEXT: s_cselect_b32 s1, s9, s4 +; GFX10-NEXT: s_cselect_b32 s4, s10, s5 +; GFX10-NEXT: s_sub_u32 s9, s1, s0 +; GFX10-NEXT: s_subb_u32 s5, s4, 0 +; GFX10-NEXT: s_cmp_lt_i32 s3, 0 +; GFX10-NEXT: s_mul_i32 s0, s0, s2 +; GFX10-NEXT: s_cselect_b32 s5, s5, s4 +; GFX10-NEXT: s_cselect_b32 s4, s9, s1 +; GFX10-NEXT: s_add_i32 s1, s8, s7 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] +; GFX10-NEXT: s_cselect_b32 s0, 0, s0 +; GFX10-NEXT: s_cselect_b32 s1, 0, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off @@ -618,7 +618,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: smulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -661,7 +661,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: smulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index 3d73f84b6e9a80..08953caee405c5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -6,19 +6,19 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_size_x: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -45,19 +45,19 @@ entry: define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_size_y: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -84,19 +84,19 @@ entry: define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_size_z: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -123,11 +123,11 @@ entry: define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xy: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_mul_i32 s4, s6, s7 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -135,8 +135,8 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xy: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -166,12 +166,12 @@ entry: define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dword s5, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0x6 +; SI-NEXT: s_load_dword s6, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_mul_i32 s4, s2, s6 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -179,11 +179,11 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x18 -; VI-NEXT: s_load_dword s5, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x18 +; VI-NEXT: s_load_dword s3, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s4, s5 +; VI-NEXT: s_mul_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -211,7 +211,7 @@ entry: define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_yz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x7 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s0, s0, s1 @@ -224,7 +224,7 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_yz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x1c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -254,13 +254,13 @@ entry: define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xyz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dword s6, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x6 +; SI-NEXT: s_load_dword s2, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s2, s4, s5 -; SI-NEXT: s_add_i32 s4, s2, s6 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: s_add_i32 s4, s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -268,12 +268,12 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xyz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dword s4, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x18 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, s4 +; VI-NEXT: s_add_i32 s0, s0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -304,19 +304,19 @@ entry: define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_size_x_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -345,19 +345,19 @@ entry: define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_size_y_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -386,19 +386,19 @@ entry: define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: local_size_z_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 4fc401ff20ac3b..4de0c548ad381c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) define amdgpu_kernel void @rint_f16( ; SI-LABEL: rint_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -28,45 +28,27 @@ define amdgpu_kernel void @rint_f16( ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-LABEL: rint_f16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_rndne_f16_e32 v0, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: rint_f16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_rndne_f16_e32 v0, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: rint_f16: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_rndne_f16_e32 v0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: rint_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -101,7 +83,7 @@ entry: define amdgpu_kernel void @rint_v2f16( ; SI-LABEL: rint_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -127,7 +109,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; VI-LABEL: rint_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -147,27 +129,27 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX9-LABEL: rint_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rndne_f16_e32 v1, v0 ; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: rint_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -194,5 +176,3 @@ entry: store <2 x half> %r.val, ptr addrspace(1) %r ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX89: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index fc962b1b4a377f..c735854a455905 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-LABEL: round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 @@ -41,7 +41,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; ; CI-LABEL: round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_brev_b32 s5, -2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 @@ -68,7 +68,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -141,85 +141,85 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xfffff +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s12, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s12 -; SI-NEXT: s_and_b32 s7, s11, 0x80000000 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] +; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 +; SI-NEXT: s_add_i32 s12, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 +; SI-NEXT: s_and_b32 s3, s11, 0x80000000 +; SI-NEXT: s_andn2_b64 s[0:1], s[10:11], s[0:1] ; SI-NEXT: s_cmp_lt_i32 s12, 0 -; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s7, s5 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s12, 51 -; SI-NEXT: s_cselect_b32 s12, s10, s4 -; SI-NEXT: s_cselect_b32 s13, s11, s5 +; SI-NEXT: s_cselect_b32 s12, s10, s0 +; SI-NEXT: s_cselect_b32 s13, s11, s1 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: s_and_b64 s[2:3], s[14:15], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s2, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s3, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s2, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s3, s1 -; SI-NEXT: s_cmp_gt_i32 s2, 51 -; SI-NEXT: s_cselect_b32 s1, s9, s1 -; SI-NEXT: s_cselect_b32 s0, s8, s0 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_brev_b32 s10, -2 +; SI-NEXT: s_and_b64 s[4:5], s[14:15], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] +; SI-NEXT: s_and_b32 s6, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s6, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s4, s8, s4 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[2:3], s[12:13], v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_bfi_b32 v1, s7, v1, v4 -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_bfi_b32 v1, s10, v1, v4 +; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[2:3], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] -; CI-NEXT: v_add_f64 v[4:5], s[6:7], -v[2:3] -; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[4:5]|, 0.5 -; CI-NEXT: v_add_f64 v[4:5], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[4:5]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[2:3], s[10:11] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] +; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[2:3] +; CI-NEXT: v_mov_b32_e32 v1, s11 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 +; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[6:7] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 ; CI-NEXT: v_bfi_b32 v1, s2, v8, v1 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s4 -; CI-NEXT: v_mov_b32_e32 v4, s5 +; CI-NEXT: v_mov_b32_e32 v4, s9 ; CI-NEXT: v_bfi_b32 v1, s2, v1, v4 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[0:1] ; CI-NEXT: s_mov_b32 s2, -1 @@ -233,151 +233,151 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s14 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xfffff +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s12, s7, 0xb0014 -; SI-NEXT: s_add_i32 s16, s12, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[12:13], s[0:1], s16 -; SI-NEXT: s_and_b32 s15, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[12:13], s[6:7], s[12:13] +; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 +; SI-NEXT: s_add_i32 s16, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s16 +; SI-NEXT: s_and_b32 s3, s11, 0x80000000 +; SI-NEXT: s_andn2_b64 s[0:1], s[10:11], s[0:1] ; SI-NEXT: s_cmp_lt_i32 s16, 0 -; SI-NEXT: s_cselect_b32 s12, 0, s12 -; SI-NEXT: s_cselect_b32 s13, s15, s13 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s16, 51 -; SI-NEXT: s_cselect_b32 s16, s6, s12 -; SI-NEXT: s_cselect_b32 s17, s7, s13 +; SI-NEXT: s_cselect_b32 s16, s10, s0 +; SI-NEXT: s_cselect_b32 s17, s11, s1 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[2:3], s[18:19], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[0:1], s6 -; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s7, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s6, 0 -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, s7, s3 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: s_brev_b32 s15, -2 -; SI-NEXT: s_cselect_b32 s2, s4, s2 -; SI-NEXT: v_bfi_b32 v5, s15, v0, v1 -; SI-NEXT: s_cselect_b32 s3, s5, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_and_b32 s7, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_and_b64 s[4:5], s[18:19], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s7, s5 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: s_cselect_b32 s4, s10, s4 -; SI-NEXT: s_cselect_b32 s5, s11, s5 +; SI-NEXT: s_cselect_b32 s5, s10, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_brev_b32 s18, -2 +; SI-NEXT: s_cselect_b32 s4, s8, s4 +; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s15, v5, v6 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 -; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[6:7], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s2 -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s2, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s3, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s2, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s3, s1 -; SI-NEXT: s_cmp_gt_i32 s2, 51 -; SI-NEXT: s_cselect_b32 s1, s9, s1 -; SI-NEXT: s_cselect_b32 s0, s8, s0 -; SI-NEXT: v_mov_b32_e32 v6, s1 -; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: v_add_f64 v[6:7], s[8:9], -v[5:6] -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v5, s15, v8, v9 -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: v_mov_b32_e32 v8, s9 -; SI-NEXT: v_bfi_b32 v5, s15, v5, v8 -; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[4:5] -; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] +; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[8:9] +; SI-NEXT: s_and_b32 s10, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s9, s10, s9 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s8, s14, s8 +; SI-NEXT: s_cselect_b32 s9, s15, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] +; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[4:5] +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s3 +; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[4:5] +; SI-NEXT: s_and_b32 s6, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s6, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s13, s5 +; SI-NEXT: s_cselect_b32 s4, s12, s4 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_add_f64 v[6:7], s[12:13], -v[5:6] +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[4:5] +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; CI-NEXT: s_brev_b32 s14, -2 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s14, v8, v5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] +; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s11 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 +; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v10, s5 -; CI-NEXT: v_bfi_b32 v5, s14, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v10, s9 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[10:11] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[6:7]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s14, v5, v12 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_add_f64 v[6:7], s[12:13], -v[10:11] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v12, s15 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v8, s9 -; CI-NEXT: v_bfi_b32 v5, s14, v5, v8 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v8, s13 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v8 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] -; CI-NEXT: s_mov_b32 s14, -1 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, ptr addrspace(1) %out @@ -387,275 +387,275 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) #0 { ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s22 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xfffff +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s20, s7, 0xb0014 -; SI-NEXT: s_add_i32 s24, s20, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[20:21], s[0:1], s24 -; SI-NEXT: s_and_b32 s23, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[20:21], s[6:7], s[20:21] +; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 +; SI-NEXT: s_add_i32 s24, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[6:7], s24 +; SI-NEXT: s_and_b32 s3, s11, 0x80000000 +; SI-NEXT: s_andn2_b64 s[0:1], s[10:11], s[0:1] ; SI-NEXT: s_cmp_lt_i32 s24, 0 -; SI-NEXT: s_cselect_b32 s20, 0, s20 -; SI-NEXT: s_cselect_b32 s21, s23, s21 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s24, 51 -; SI-NEXT: s_cselect_b32 s24, s6, s20 -; SI-NEXT: s_cselect_b32 s25, s7, s21 +; SI-NEXT: s_cselect_b32 s24, s10, s0 +; SI-NEXT: s_cselect_b32 s25, s11, s1 ; SI-NEXT: v_mov_b32_e32 v0, s24 ; SI-NEXT: v_mov_b32_e32 v1, s25 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[2:3], s[26:27], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s23, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s23, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_cselect_b32 s6, s4, s6 -; SI-NEXT: v_bfi_b32 v9, s2, v0, v1 -; SI-NEXT: s_cselect_b32 s7, s5, s7 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] +; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: s_and_b64 s[24:25], s[26:27], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_and_b32 s23, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_and_b64 s[4:5], s[26:27], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] +; SI-NEXT: s_and_b32 s11, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s23, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: s_cselect_b32 s4, s8, s4 +; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v9, s2, v4, v5 -; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] -; SI-NEXT: s_and_b64 s[6:7], s[24:25], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v6, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] -; SI-NEXT: s_and_b32 s10, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s10, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s6, s8, s6 -; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v5, s7 -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v6, v7 -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] +; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s10 +; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[8:9] +; SI-NEXT: s_and_b32 s11, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s9, s11, s9 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s8, s14, s8 +; SI-NEXT: s_cselect_b32 s9, s15, s9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] +; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[8:9] ; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_and_b32 s8, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[4:5] +; SI-NEXT: s_and_b32 s11, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s4, s14, s4 -; SI-NEXT: s_cselect_b32 s5, s15, s5 +; SI-NEXT: s_cselect_b32 s5, s11, s5 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s4, s12, s4 +; SI-NEXT: s_cselect_b32 s5, s13, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: v_add_f64 v[4:5], s[14:15], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v10, s9 -; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v10 -; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[8:9] -; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v12, s3 -; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[6:7], s[12:13], s[6:7] -; SI-NEXT: s_and_b32 s8, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s8, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s7, s13, s7 -; SI-NEXT: s_cselect_b32 s6, s12, s6 -; SI-NEXT: v_mov_b32_e32 v10, s7 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[9:10] -; SI-NEXT: v_mov_b32_e32 v13, s15 -; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v12, v13 -; SI-NEXT: v_add_f64 v[12:13], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v14, s3 -; SI-NEXT: s_bfe_u32 s3, s19, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[4:5], s[18:19], s[4:5] -; SI-NEXT: s_and_b32 s8, s19, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 +; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[8:9] +; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec +; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s10 +; SI-NEXT: s_andn2_b64 s[8:9], s[18:19], s[8:9] +; SI-NEXT: s_and_b32 s11, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s9, s11, s9 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s8, s18, s8 +; SI-NEXT: s_cselect_b32 s9, s19, s9 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[8:9] +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_bfe_u32 s4, s17, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_andn2_b64 s[4:5], s[16:17], s[4:5] +; SI-NEXT: s_and_b32 s11, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s5, s19, s5 -; SI-NEXT: s_cselect_b32 s4, s18, s4 +; SI-NEXT: s_cselect_b32 s5, s11, s5 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s5, s17, s5 +; SI-NEXT: s_cselect_b32 s4, s16, s4 ; SI-NEXT: v_mov_b32_e32 v10, s5 ; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[9:10] -; SI-NEXT: v_mov_b32_e32 v15, s13 -; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v14, v15 -; SI-NEXT: v_add_f64 v[10:11], s[6:7], v[8:9] -; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s17, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[0:1], s[16:17], s[0:1] -; SI-NEXT: s_and_b32 s6, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s6, s1 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s1, s17, s1 -; SI-NEXT: s_cselect_b32 s0, s16, s0 -; SI-NEXT: v_mov_b32_e32 v15, s1 -; SI-NEXT: v_mov_b32_e32 v14, s0 -; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] -; SI-NEXT: v_mov_b32_e32 v16, s19 +; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[9:10] +; SI-NEXT: v_mov_b32_e32 v13, s19 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 +; SI-NEXT: v_add_f64 v[12:13], s[8:9], v[8:9] +; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec +; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: s_bfe_u32 s8, s23, 0xb0014 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s10 +; SI-NEXT: s_andn2_b64 s[8:9], s[22:23], s[8:9] +; SI-NEXT: s_and_b32 s11, s23, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s9, s11, s9 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s9, s23, s9 +; SI-NEXT: s_cselect_b32 s8, s22, s8 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_add_f64 v[10:11], s[22:23], -v[9:10] +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 +; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[8:9] +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_bfe_u32 s4, s21, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_andn2_b64 s[4:5], s[20:21], s[4:5] +; SI-NEXT: s_and_b32 s6, s21, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s6, s5 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s5, s21, s5 +; SI-NEXT: s_cselect_b32 s4, s20, s4 +; SI-NEXT: v_mov_b32_e32 v15, s5 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_add_f64 v[14:15], s[20:21], -v[14:15] +; SI-NEXT: v_mov_b32_e32 v16, s23 ; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v16 -; SI-NEXT: v_add_f64 v[16:17], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v14 -; SI-NEXT: v_add_f64 v[14:15], s[0:1], v[8:9] -; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_add_f64 v[16:17], s[8:9], v[8:9] +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v14 +; SI-NEXT: v_add_f64 v[14:15], s[4:5], v[8:9] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; CI-NEXT: s_brev_b32 s22, -2 +; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; CI-NEXT: s_mov_b32 s23, 0xf000 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s22, v8, v5 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] +; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s11 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 +; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v10, s5 -; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_bfi_b32 v5, s22, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v10, s9 +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[10:11], s[8:9], -v[6:7] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] -; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s22, v5, v12 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[12:13], s[14:15], -v[10:11] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] +; CI-NEXT: v_mov_b32_e32 v12, s15 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11] ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v14, s9 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[14:15], s[12:13] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[12:13], s[12:13], -v[14:15] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v14, s13 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v16, s15 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v16 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v16, s19 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v18, s13 -; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[16:17] -; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v18, s17 +; CI-NEXT: v_add_f64 v[10:11], s[22:23], -v[16:17] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 ; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[18:19], s[16:17], -v[14:15] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v20, s19 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s22, v5, v20 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v20, s23 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v18, s17 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v18, s21 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 ; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] -; CI-NEXT: s_mov_b32 s22, -1 -; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 5347f0f00c3157..c0a85bba93b738 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-LABEL: round_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -26,8 +26,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; ; GFX8-LABEL: round_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -44,8 +44,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; ; GFX9-LABEL: round_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -63,18 +63,18 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX11-LABEL: round_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s4 +; GFX11-NEXT: v_trunc_f32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -107,7 +107,7 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 { ; GFX6-LABEL: round_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -133,7 +133,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX8-LABEL: round_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_brev_b32 s8, -2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -159,33 +159,33 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX9-LABEL: round_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_brev_b32 s8, -2 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s7 -; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_trunc_f32_e32 v0, s3 +; GFX9-NEXT: v_sub_f32_e32 v1, s3, v0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2 ; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s6 -; GFX9-NEXT: v_sub_f32_e32 v2, s6, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_trunc_f32_e32 v0, s2 +; GFX9-NEXT: v_sub_f32_e32 v2, s2, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s3 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2 @@ -238,157 +238,157 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 { ; GFX6-LABEL: round_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GFX6-NEXT: s_brev_b32 s10, -2 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_trunc_f32_e32 v0, s7 -; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX6-NEXT: v_trunc_f32_e32 v0, s3 +; GFX6-NEXT: v_sub_f32_e32 v1, s3, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX6-NEXT: v_trunc_f32_e32 v0, s6 -; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX6-NEXT: v_trunc_f32_e32 v0, s2 +; GFX6-NEXT: v_sub_f32_e32 v1, s2, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX6-NEXT: v_trunc_f32_e32 v0, s5 -; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-NEXT: v_trunc_f32_e32 v0, s1 +; GFX6-NEXT: v_sub_f32_e32 v1, s1, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v4, s1 ; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v4 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX6-NEXT: v_trunc_f32_e32 v0, s4 -; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: v_trunc_f32_e32 v0, s0 +; GFX6-NEXT: v_sub_f32_e32 v4, s0, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v5, s0 ; GFX6-NEXT: v_bfi_b32 v4, s10, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v0, s7 -; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_trunc_f32_e32 v0, s3 +; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 ; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s6 -; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_trunc_f32_e32 v0, s2 +; GFX8-NEXT: v_sub_f32_e32 v1, s2, v0 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 ; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s5 -; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_trunc_f32_e32 v0, s1 +; GFX8-NEXT: v_sub_f32_e32 v1, s1, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 ; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s4 -; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_trunc_f32_e32 v0, s0 +; GFX8-NEXT: v_sub_f32_e32 v4, s0, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: round_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_brev_b32 s10, -2 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX9-NEXT: s_brev_b32 s6, -2 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s7 -; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX9-NEXT: v_trunc_f32_e32 v0, s3 +; GFX9-NEXT: v_sub_f32_e32 v1, s3, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s6 -; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX9-NEXT: v_trunc_f32_e32 v0, s2 +; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s5 -; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX9-NEXT: v_trunc_f32_e32 v0, s1 +; GFX9-NEXT: v_sub_f32_e32 v1, s1, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v4 ; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s4 -; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX9-NEXT: v_trunc_f32_e32 v0, s0 +; GFX9-NEXT: v_sub_f32_e32 v4, s0, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_bfi_b32 v4, s6, v4, v5 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s7 -; GFX11-NEXT: v_trunc_f32_e32 v1, s6 -; GFX11-NEXT: v_trunc_f32_e32 v4, s5 -; GFX11-NEXT: v_trunc_f32_e32 v5, s4 +; GFX11-NEXT: v_trunc_f32_e32 v0, s3 +; GFX11-NEXT: v_trunc_f32_e32 v1, s2 +; GFX11-NEXT: v_trunc_f32_e32 v4, s1 +; GFX11-NEXT: v_trunc_f32_e32 v5, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1 -; GFX11-NEXT: v_dual_sub_f32 v6, s5, v4 :: v_dual_sub_f32 v7, s4, v5 +; GFX11-NEXT: v_dual_sub_f32 v2, s3, v0 :: v_dual_sub_f32 v3, s2, v1 +; GFX11-NEXT: v_dual_sub_f32 v6, s1, v4 :: v_dual_sub_f32 v7, s0, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7 +; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2 -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v6|, 0.5 -; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s6 +; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v6|, 0.5 +; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s2 -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s6 +; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v7|, 0.5 ; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s6 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4 +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s0 ; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; ; R600-LABEL: round_v4f32: @@ -432,273 +432,273 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 { ; GFX6-LABEL: round_v8f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_brev_b32 s6, -2 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_trunc_f32_e32 v0, s7 -; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v0, s11 +; GFX6-NEXT: v_sub_f32_e32 v1, s11, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX6-NEXT: v_trunc_f32_e32 v0, s6 -; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v0, s10 +; GFX6-NEXT: v_sub_f32_e32 v1, s10, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX6-NEXT: v_trunc_f32_e32 v0, s5 -; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX6-NEXT: v_trunc_f32_e32 v0, s9 +; GFX6-NEXT: v_sub_f32_e32 v1, s9, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v4 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX6-NEXT: v_trunc_f32_e32 v0, s4 -; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX6-NEXT: v_trunc_f32_e32 v0, s8 +; GFX6-NEXT: v_sub_f32_e32 v4, s8, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, s8 +; GFX6-NEXT: v_bfi_b32 v4, s6, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX6-NEXT: v_trunc_f32_e32 v4, s11 -; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s11 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_trunc_f32_e32 v4, s15 +; GFX6-NEXT: v_sub_f32_e32 v5, s15, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v6, s15 +; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 -; GFX6-NEXT: v_trunc_f32_e32 v4, s10 -; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_trunc_f32_e32 v4, s14 +; GFX6-NEXT: v_sub_f32_e32 v5, s14, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 -; GFX6-NEXT: v_trunc_f32_e32 v4, s9 -; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX6-NEXT: v_trunc_f32_e32 v4, s13 +; GFX6-NEXT: v_sub_f32_e32 v5, s13, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v8, s13 +; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v8 ; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 -; GFX6-NEXT: v_trunc_f32_e32 v4, s8 -; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v9, s8 -; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX6-NEXT: v_trunc_f32_e32 v4, s12 +; GFX6-NEXT: v_sub_f32_e32 v8, s12, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, s12 +; GFX6-NEXT: v_bfi_b32 v8, s6, v8, v9 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_v8f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GFX8-NEXT: s_brev_b32 s2, -2 -; GFX8-NEXT: s_mov_b32 s15, 0xf000 -; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_brev_b32 s6, -2 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v0, s7 -; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX8-NEXT: v_trunc_f32_e32 v0, s11 +; GFX8-NEXT: v_sub_f32_e32 v1, s11, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s6 -; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX8-NEXT: v_trunc_f32_e32 v0, s10 +; GFX8-NEXT: v_sub_f32_e32 v1, s10, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s5 -; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX8-NEXT: v_trunc_f32_e32 v0, s9 +; GFX8-NEXT: v_sub_f32_e32 v1, s9, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v4 ; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX8-NEXT: v_trunc_f32_e32 v0, s4 -; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX8-NEXT: v_trunc_f32_e32 v0, s8 +; GFX8-NEXT: v_sub_f32_e32 v4, s8, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_bfi_b32 v4, s6, v4, v5 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX8-NEXT: v_trunc_f32_e32 v4, s11 -; GFX8-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX8-NEXT: v_trunc_f32_e32 v4, s15 +; GFX8-NEXT: v_sub_f32_e32 v5, s15, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, s15 +; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v6 ; GFX8-NEXT: v_add_f32_e32 v7, v4, v5 -; GFX8-NEXT: v_trunc_f32_e32 v4, s10 -; GFX8-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v6, s10 -; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX8-NEXT: v_trunc_f32_e32 v4, s14 +; GFX8-NEXT: v_sub_f32_e32 v5, s14, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v6 ; GFX8-NEXT: v_add_f32_e32 v6, v4, v5 -; GFX8-NEXT: v_trunc_f32_e32 v4, s9 -; GFX8-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX8-NEXT: v_trunc_f32_e32 v4, s13 +; GFX8-NEXT: v_sub_f32_e32 v5, s13, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v8, s13 +; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v8 ; GFX8-NEXT: v_add_f32_e32 v5, v4, v5 -; GFX8-NEXT: v_trunc_f32_e32 v4, s8 -; GFX8-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, s8 -; GFX8-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX8-NEXT: v_trunc_f32_e32 v4, s12 +; GFX8-NEXT: v_sub_f32_e32 v8, s12, v4 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: v_bfi_b32 v8, s6, v8, v9 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX8-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX8-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: round_v8f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_brev_b32 s6, -2 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v0, s7 -; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_trunc_f32_e32 v0, s11 +; GFX9-NEXT: v_sub_f32_e32 v1, s11, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s6 -; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_trunc_f32_e32 v0, s10 +; GFX9-NEXT: v_sub_f32_e32 v1, s10, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 ; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s5 -; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX9-NEXT: v_trunc_f32_e32 v0, s9 +; GFX9-NEXT: v_sub_f32_e32 v1, s9, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v4 ; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX9-NEXT: v_trunc_f32_e32 v0, s4 -; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX9-NEXT: v_trunc_f32_e32 v0, s8 +; GFX9-NEXT: v_sub_f32_e32 v4, s8, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_bfi_b32 v4, s6, v4, v5 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_trunc_f32_e32 v4, s11 -; GFX9-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX9-NEXT: v_trunc_f32_e32 v4, s15 +; GFX9-NEXT: v_sub_f32_e32 v5, s15, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v6 ; GFX9-NEXT: v_add_f32_e32 v7, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v4, s10 -; GFX9-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX9-NEXT: v_trunc_f32_e32 v4, s14 +; GFX9-NEXT: v_sub_f32_e32 v5, s14, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v6 ; GFX9-NEXT: v_add_f32_e32 v6, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v4, s9 -; GFX9-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX9-NEXT: v_trunc_f32_e32 v4, s13 +; GFX9-NEXT: v_sub_f32_e32 v5, s13, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v8 ; GFX9-NEXT: v_add_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v4, s8 -; GFX9-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX9-NEXT: v_trunc_f32_e32 v4, s12 +; GFX9-NEXT: v_sub_f32_e32 v8, s12, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_bfi_b32 v8, s6, v8, v9 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s7 -; GFX11-NEXT: v_trunc_f32_e32 v1, s6 -; GFX11-NEXT: v_trunc_f32_e32 v4, s5 -; GFX11-NEXT: v_trunc_f32_e32 v8, s4 -; GFX11-NEXT: v_trunc_f32_e32 v5, s11 +; GFX11-NEXT: v_trunc_f32_e32 v0, s11 +; GFX11-NEXT: v_trunc_f32_e32 v1, s10 +; GFX11-NEXT: v_trunc_f32_e32 v4, s9 +; GFX11-NEXT: v_trunc_f32_e32 v8, s8 +; GFX11-NEXT: v_trunc_f32_e32 v5, s15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1 -; GFX11-NEXT: v_sub_f32_e32 v7, s5, v4 -; GFX11-NEXT: v_trunc_f32_e32 v9, s9 +; GFX11-NEXT: v_dual_sub_f32 v2, s11, v0 :: v_dual_sub_f32 v3, s10, v1 +; GFX11-NEXT: v_sub_f32_e32 v7, s9, v4 +; GFX11-NEXT: v_trunc_f32_e32 v9, s13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_sub_f32_e32 v12, s11, v5 +; GFX11-NEXT: v_sub_f32_e32 v12, s15, v5 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5 -; GFX11-NEXT: v_sub_f32_e32 v11, s4, v8 -; GFX11-NEXT: v_trunc_f32_e32 v6, s10 -; GFX11-NEXT: v_sub_f32_e32 v14, s9, v9 -; GFX11-NEXT: v_trunc_f32_e32 v10, s8 +; GFX11-NEXT: v_sub_f32_e32 v11, s8, v8 +; GFX11-NEXT: v_trunc_f32_e32 v6, s14 +; GFX11-NEXT: v_sub_f32_e32 v14, s13, v9 +; GFX11-NEXT: v_trunc_f32_e32 v10, s12 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s11 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s6 +; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5 -; GFX11-NEXT: v_sub_f32_e32 v13, s10, v6 +; GFX11-NEXT: v_sub_f32_e32 v13, s14, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16 -; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s5 +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5 ; GFX11-NEXT: v_add_f32_e32 v1, v4, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s4 +; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s8 ; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s11 +; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s15 ; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v7, v5, v12 -; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s10 -; GFX11-NEXT: v_sub_f32_e32 v15, s8, v10 +; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s14 +; GFX11-NEXT: v_sub_f32_e32 v15, s12, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s9 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2 ; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s8 +; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s12 ; GFX11-NEXT: v_add_f32_e32 v4, v10, v4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 @@ -771,10 +771,10 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-LABEL: round_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_trunc_f32_e32 v1, v0 ; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5 @@ -791,18 +791,18 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; ; GFX8-LABEL: round_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f16_e32 v1, s4 -; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX8-NEXT: v_trunc_f16_e32 v1, s6 +; GFX8-NEXT: v_sub_f16_e32 v2, s6, v1 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -810,18 +810,18 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; ; GFX9-LABEL: round_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s5, 0x7fff +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f16_e32 v1, s4 -; GFX9-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX9-NEXT: v_trunc_f16_e32 v1, s6 +; GFX9-NEXT: v_sub_f16_e32 v2, s6, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_add_f16_e32 v0, v1, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -830,18 +830,18 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX11-LABEL: round_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, s4 +; GFX11-NEXT: v_trunc_f16_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -883,13 +883,13 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-LABEL: round_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_trunc_f32_e32 v3, v1 ; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v0 @@ -914,26 +914,26 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX8-LABEL: round_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX8-NEXT: s_movk_i32 s6, 0x7fff +; GFX8-NEXT: s_movk_i32 s5, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s4, 16 -; GFX8-NEXT: v_trunc_f16_e32 v1, s5 -; GFX8-NEXT: v_sub_f16_e32 v2, s5, v1 +; GFX8-NEXT: s_lshr_b32 s4, s6, 16 +; GFX8-NEXT: v_trunc_f16_e32 v1, s4 +; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_bfi_b32 v2, s6, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_bfi_b32 v2, s5, v2, v3 ; GFX8-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_trunc_f16_e32 v2, s4 -; GFX8-NEXT: v_sub_f16_e32 v3, s4, v2 +; GFX8-NEXT: v_trunc_f16_e32 v2, s6 +; GFX8-NEXT: v_sub_f16_e32 v3, s6, v2 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_bfi_b32 v0, s6, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v3 ; GFX8-NEXT: v_add_f16_e32 v0, v2, v0 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -942,26 +942,26 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX9-LABEL: round_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: s_movk_i32 s5, 0x7fff ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_trunc_f16_e32 v1, s5 -; GFX9-NEXT: v_sub_f16_e32 v2, s5, v1 +; GFX9-NEXT: s_lshr_b32 s4, s6, 16 +; GFX9-NEXT: v_trunc_f16_e32 v1, s4 +; GFX9-NEXT: v_sub_f16_e32 v2, s4, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_bfi_b32 v2, s6, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_bfi_b32 v2, s5, v2, v3 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v2 -; GFX9-NEXT: v_trunc_f16_e32 v2, s4 -; GFX9-NEXT: v_sub_f16_e32 v3, s4, v2 +; GFX9-NEXT: v_trunc_f16_e32 v2, s6 +; GFX9-NEXT: v_sub_f16_e32 v3, s6, v2 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_bfi_b32 v0, s6, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -971,28 +971,28 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX11-LABEL: round_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s5, s4, 16 -; GFX11-NEXT: v_trunc_f16_e32 v1, s4 -; GFX11-NEXT: v_trunc_f16_e32 v0, s5 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_trunc_f16_e32 v1, s2 +; GFX11-NEXT: v_trunc_f16_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v3, s4, v1 -; GFX11-NEXT: v_sub_f16_e32 v2, s5, v0 +; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 +; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s2 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s5 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index d618b937910276..d69aae0b737473 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -64,7 +64,7 @@ define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) { define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; GFX6-LABEL: s_set_rounding_kernel: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX6-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX6-NEXT: ;;#ASMSTART @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX7-LABEL: s_set_rounding_kernel: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX7-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX7-NEXT: ;;#ASMSTART @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX8-LABEL: s_set_rounding_kernel: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX8-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX8-NEXT: ;;#ASMSTART @@ -109,7 +109,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX9-LABEL: s_set_rounding_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX9-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX9-NEXT: ;;#ASMSTART @@ -124,7 +124,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX10-LABEL: s_set_rounding_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX11-LABEL: s_set_rounding_kernel: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index a177a61823dc4d..2bb89fdabda7e9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,31 +46,31 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX9-NEXT: v_sin_f16_e32 v1, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sin_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 ; GFX10-NEXT: v_sin_f16_e32 v1, v1 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: sin_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -89,7 +89,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -140,39 +140,39 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_sin_f16_e32 v2, v3 ; GFX9-NEXT: v_sin_f16_e32 v1, v1 ; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sin_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_sin_f16_e32 v2, v3 ; GFX10-NEXT: v_sin_f16_e32 v1, v1 ; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: sin_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index 21c4455565db80..716dd3fbd4c74d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) define amdgpu_kernel void @sqrt_f16( ; SI-LABEL: sqrt_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; VI-LABEL: sqrt_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; GFX11-LABEL: sqrt_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -81,7 +81,7 @@ entry: define amdgpu_kernel void @sqrt_v2f16( ; SI-LABEL: sqrt_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -107,7 +107,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; VI-LABEL: sqrt_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -127,7 +127,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; GFX11-LABEL: sqrt_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 623db04a0e90d7..47777e3853e894 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) define amdgpu_kernel void @trunc_f16( ; SI-LABEL: trunc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @trunc_f16( ; ; VI-LABEL: trunc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @trunc_f16( ; ; GFX11-LABEL: trunc_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -82,7 +82,7 @@ entry: define amdgpu_kernel void @trunc_v2f16( ; SI-LABEL: trunc_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -108,7 +108,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; VI-LABEL: trunc_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -128,7 +128,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; GFX11-LABEL: trunc_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll index 1afd1786569b6e..3df2627128fef8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-LABEL: constant_load_v8f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s16, s[10:11], 0x0 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur ; ; GFX12-LABEL: constant_load_v8f32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index e8115a3db557e7..6f95364ac36447 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_f64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_f64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -47,7 +47,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -66,7 +66,7 @@ attributes #0 = { nounwind } define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-NOHSA-LABEL: constant_load_2v4f64: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[24:25], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -90,7 +90,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -112,7 +112,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX8-NOHSA-LABEL: constant_load_2v4f64: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -134,7 +134,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX12-LABEL: constant_load_2v4f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index d8b4fadeebba73..c1ab63b8160c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: constant_load_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -82,7 +82,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v2i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -99,7 +99,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v2i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -138,7 +138,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -153,7 +153,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v3i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -170,7 +170,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v3i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -223,7 +223,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v4i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -240,7 +240,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v4i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -294,7 +294,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v8i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -311,7 +311,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v8i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -350,7 +350,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -365,7 +365,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v16i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -382,7 +382,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v16i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -421,7 +421,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -436,7 +436,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v32i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -448,7 +448,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v32i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -476,7 +476,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v32i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -491,7 +491,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v64i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -504,7 +504,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v64i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v64i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -549,7 +549,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -566,7 +566,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -595,7 +595,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -611,7 +611,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -629,7 +629,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -660,7 +660,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -695,7 +695,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -724,7 +724,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -740,7 +740,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -758,7 +758,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -789,7 +789,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -807,7 +807,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -826,7 +826,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -860,7 +860,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] @@ -880,7 +880,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -899,7 +899,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -933,7 +933,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -953,7 +953,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -974,7 +974,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v3, s[2:3] @@ -1036,7 +1036,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1139,7 +1139,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v4, s[2:3] @@ -1201,7 +1201,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1264,7 +1264,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1287,7 +1287,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1313,7 +1313,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1379,7 +1379,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -1412,7 +1412,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1438,7 +1438,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1502,7 +1502,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1684,7 +1684,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1768,7 +1768,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1925,7 +1925,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2221,7 +2221,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2294,7 +2294,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2383,7 +2383,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2622,7 +2622,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2696,7 +2696,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2862,7 +2862,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3266,7 +3266,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3398,7 +3398,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3564,7 +3564,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4028,7 +4028,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4160,7 +4160,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4179,7 +4179,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4211,7 +4211,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4229,7 +4229,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4248,7 +4248,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4281,7 +4281,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4299,7 +4299,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4318,7 +4318,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4350,7 +4350,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4368,7 +4368,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4387,7 +4387,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4420,7 +4420,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4438,7 +4438,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4459,7 +4459,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4497,7 +4497,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -4520,7 +4520,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4542,7 +4542,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4581,7 +4581,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -4604,7 +4604,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4628,7 +4628,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v10, 2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v5 @@ -4679,7 +4679,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v5, s[2:3] @@ -4707,7 +4707,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4733,7 +4733,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4788,7 +4788,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v6, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v6, s[2:3] @@ -4817,7 +4817,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4843,7 +4843,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4899,7 +4899,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -4933,7 +4933,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4962,7 +4962,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5023,7 +5023,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -5056,7 +5056,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5092,7 +5092,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5184,7 +5184,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v12, v1, s[2:3] @@ -5220,7 +5220,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5377,7 +5377,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v16, s[2:3] @@ -5425,7 +5425,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5484,7 +5484,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5651,7 +5651,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5715,7 +5715,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5787,7 +5787,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5998,7 +5998,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v32, s[2:3] @@ -6076,7 +6076,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -6183,7 +6183,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6495,7 +6495,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6605,7 +6605,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -6770,7 +6770,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -7182,7 +7182,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7313,7 +7313,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7516,7 +7516,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -8133,7 +8133,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8341,7 +8341,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8664,14 +8664,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s90, -1 ; GFX8-NEXT: s_mov_b32 s91, 0xe80000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; GFX8-NEXT: s_add_u32 s88, s88, s9 +; GFX8-NEXT: s_add_u32 s88, s88, s11 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -9473,7 +9473,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[12:13], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 7b2ccb60d142b1..bb98af4e7a5c7f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -38,7 +38,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: constant_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -77,7 +77,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -93,7 +93,7 @@ entry: define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -105,7 +105,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -145,7 +145,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -161,7 +161,7 @@ entry: define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -176,7 +176,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -194,7 +194,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4 @@ -248,7 +248,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -267,7 +267,7 @@ entry: define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -280,7 +280,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -293,7 +293,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -322,7 +322,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -339,7 +339,7 @@ entry: define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -354,7 +354,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -369,7 +369,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -400,7 +400,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -418,7 +418,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -439,7 +439,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -463,7 +463,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s8, 16 @@ -512,7 +512,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -534,7 +534,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #0 { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -578,7 +578,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -596,7 +596,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 @@ -730,7 +730,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GFX12-LABEL: constant_load_v16i16_align2: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x7 @@ -772,7 +772,7 @@ entry: define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -789,7 +789,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -802,7 +802,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -831,7 +831,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -847,7 +847,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -864,7 +864,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -877,7 +877,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -907,7 +907,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -923,7 +923,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -940,7 +940,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -953,7 +953,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -982,7 +982,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -998,7 +998,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1028,7 +1028,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1074,7 +1074,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1138,7 +1138,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1159,7 +1159,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1174,7 +1174,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1189,7 +1189,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1224,7 +1224,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1244,7 +1244,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1263,7 +1263,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1280,7 +1280,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1320,7 +1320,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1341,7 +1341,7 @@ entry: define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1360,7 +1360,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1377,7 +1377,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1420,7 +1420,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1443,7 +1443,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1462,7 +1462,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1523,7 +1523,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1549,7 +1549,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1568,7 +1568,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1587,7 +1587,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1656,7 +1656,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1781,7 +1781,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1814,7 +1814,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1843,7 +1843,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1875,7 +1875,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1941,7 +1941,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1971,7 +1971,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2078,7 +2078,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2189,7 +2189,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2233,7 +2233,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2282,7 +2282,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2340,7 +2340,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2455,7 +2455,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2499,7 +2499,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -2588,7 +2588,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2698,7 +2698,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,7 +2903,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2978,7 +2978,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3177,7 +3177,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3392,7 +3392,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3467,7 +3467,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 @@ -3636,7 +3636,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3852,7 +3852,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4254,7 +4254,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 @@ -4388,7 +4388,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 @@ -4557,7 +4557,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4773,7 +4773,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 @@ -5191,7 +5191,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 @@ -5325,7 +5325,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5343,7 +5343,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5357,7 +5357,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5389,7 +5389,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5411,7 +5411,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5429,7 +5429,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5443,7 +5443,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5477,7 +5477,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5496,7 +5496,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5514,7 +5514,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5528,7 +5528,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5560,7 +5560,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5577,7 +5577,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5595,7 +5595,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5609,7 +5609,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5643,7 +5643,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5662,7 +5662,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5679,7 +5679,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5696,7 +5696,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5734,7 +5734,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5755,7 +5755,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5773,7 +5773,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -5791,7 +5791,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -5832,7 +5832,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5853,7 +5853,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5876,7 +5876,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5902,7 +5902,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5957,7 +5957,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5984,7 +5984,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6011,7 +6011,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6041,7 +6041,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6103,7 +6103,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6132,7 +6132,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6167,7 +6167,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6211,7 +6211,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,7 +6302,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6341,7 +6341,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6385,7 +6385,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6439,7 +6439,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6545,7 +6545,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6587,7 +6587,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -6646,7 +6646,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6726,7 +6726,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6890,7 +6890,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6953,7 +6953,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -7031,7 +7031,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7137,7 +7137,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7331,7 +7331,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7400,7 +7400,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -7507,7 +7507,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7659,7 +7659,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7974,7 +7974,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8085,7 +8085,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -8233,7 +8233,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8437,7 +8437,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -8815,7 +8815,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 72d19a1475682a..6eeaec12c3d148 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-HSA-LABEL: constant_load_i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -73,7 +73,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -89,7 +89,7 @@ entry: define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -102,7 +102,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -115,7 +115,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v2i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -144,7 +144,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v2i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -173,7 +173,7 @@ entry: define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -238,7 +238,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v3i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -251,7 +251,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -268,7 +268,7 @@ entry: define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -283,7 +283,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -298,7 +298,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v4i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -329,7 +329,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v4i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -343,7 +343,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -361,7 +361,7 @@ entry: define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -382,7 +382,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -406,7 +406,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v8i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 16 @@ -450,7 +450,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v8i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -469,7 +469,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -491,7 +491,7 @@ entry: define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v9i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -516,7 +516,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -547,7 +547,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v9i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -604,7 +604,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v9i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s12, s[10:11], 0x20 @@ -626,7 +626,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v9i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x20 @@ -651,7 +651,7 @@ entry: define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v10i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -677,7 +677,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -709,7 +709,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v10i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -768,7 +768,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v10i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 @@ -791,7 +791,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v10i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 @@ -817,7 +817,7 @@ entry: define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v11i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -846,7 +846,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -879,7 +879,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v11i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -944,7 +944,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v11i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -968,7 +968,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v11i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20 @@ -994,7 +994,7 @@ entry: define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v12i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1022,7 +1022,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1056,7 +1056,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v12i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1117,7 +1117,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v12i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v12i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 @@ -1169,7 +1169,7 @@ entry: define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -1202,7 +1202,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1244,7 +1244,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v16i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_add_u32 s18, s16, 48 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v16i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1375,7 +1375,7 @@ entry: define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1388,7 +1388,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1431,7 +1431,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1458,7 +1458,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1486,7 +1486,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1517,7 +1517,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1549,7 +1549,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1562,7 +1562,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1575,7 +1575,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1605,7 +1605,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1616,7 +1616,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1632,7 +1632,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1646,7 +1646,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1660,7 +1660,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1691,7 +1691,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1704,7 +1704,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1738,7 +1738,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1753,7 +1753,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1788,7 +1788,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,7 +1801,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1836,7 +1836,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1854,7 +1854,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1894,7 +1894,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -1910,7 +1910,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1931,7 +1931,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1950,7 +1950,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1972,7 +1972,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2021,7 +2021,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2038,7 +2038,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2083,7 +2083,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2111,7 +2111,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2169,7 +2169,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -2192,7 +2192,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2218,7 +2218,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -2245,7 +2245,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2363,7 +2363,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2388,7 +2388,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2415,7 +2415,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -2453,7 +2453,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,7 +2606,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -2645,7 +2645,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2681,7 +2681,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -2748,7 +2748,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2848,7 +2848,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3035,7 +3035,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -3102,7 +3102,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3159,7 +3159,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -3202,7 +3202,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3272,7 +3272,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,7 +3420,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3461,7 +3461,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3500,7 +3500,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -3636,7 +3636,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3826,7 +3826,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4187,7 +4187,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4311,7 +4311,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 @@ -4414,7 +4414,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -4492,7 +4492,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4627,7 +4627,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4912,7 +4912,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4986,7 +4986,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -5051,7 +5051,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -5111,7 +5111,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5194,7 +5194,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5328,7 +5328,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v32i32: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -5379,7 +5379,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 7fd5909f3b2b1c..102c33ec31b09d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-LABEL: constant_load_i64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: constant_load_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -63,7 +63,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -79,7 +79,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v2i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -94,7 +94,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -109,7 +109,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v2i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -158,7 +158,7 @@ entry: define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v3i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -178,7 +178,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -201,7 +201,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v3i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -249,7 +249,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10 @@ -272,7 +272,7 @@ entry: define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v4i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -293,7 +293,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_add_u32 s10, s8, 16 @@ -317,7 +317,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v4i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_add_u32 s10, s8, 16 @@ -366,7 +366,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -388,7 +388,7 @@ entry: define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v8i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 @@ -421,7 +421,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v8i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -463,7 +463,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v8i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NEXT: s_add_u32 s18, s16, 48 @@ -550,7 +550,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -578,7 +578,7 @@ entry: define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v16i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NEXT: s_mov_b32 s39, 0xf000 @@ -638,7 +638,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: constant_load_v16i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -889,7 +889,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index f14d4afbee9d97..ff55ab8859c833 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-NOHSA-LABEL: constant_load_i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -78,7 +78,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -94,7 +94,7 @@ entry: define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -111,7 +111,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -124,7 +124,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v2i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -163,7 +163,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -179,7 +179,7 @@ entry: define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -194,7 +194,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -213,7 +213,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v3i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -274,7 +274,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -292,7 +292,7 @@ entry: define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -304,7 +304,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v4i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -344,7 +344,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -360,7 +360,7 @@ entry: define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -373,7 +373,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -386,7 +386,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v8i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -432,7 +432,7 @@ entry: define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -447,7 +447,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -462,7 +462,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v16i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -493,7 +493,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -511,7 +511,7 @@ entry: define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -528,7 +528,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -541,7 +541,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -570,7 +570,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -586,7 +586,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -603,7 +603,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -616,7 +616,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -646,7 +646,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -662,7 +662,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -679,7 +679,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -692,7 +692,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -721,7 +721,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -737,7 +737,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -754,7 +754,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -767,7 +767,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -797,7 +797,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -814,7 +814,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -833,7 +833,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -848,7 +848,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -892,7 +892,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -913,7 +913,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -932,7 +932,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -947,7 +947,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -989,7 +989,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1111,7 +1111,7 @@ entry: define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1130,7 +1130,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1212,7 +1212,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1316,7 +1316,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1399,7 +1399,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1423,7 +1423,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1551,7 +1551,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1582,7 +1582,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,7 +1643,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1713,7 +1713,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1793,7 +1793,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1851,7 +1851,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1965,7 +1965,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2010,7 +2010,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2117,7 +2117,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2239,7 +2239,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2284,7 +2284,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2373,7 +2373,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2483,7 +2483,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2690,7 +2690,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2766,7 +2766,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2855,7 +2855,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2965,7 +2965,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3195,7 +3195,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3270,7 +3270,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3438,7 +3438,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3654,7 +3654,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4054,7 +4054,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4187,7 +4187,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4355,7 +4355,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4569,7 +4569,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5012,7 +5012,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5144,7 +5144,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5162,7 +5162,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5176,7 +5176,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -5208,7 +5208,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5226,7 +5226,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5244,7 +5244,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5258,7 +5258,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5292,7 +5292,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5311,7 +5311,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5329,7 +5329,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5343,7 +5343,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5374,7 +5374,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5391,7 +5391,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5409,7 +5409,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5423,7 +5423,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5457,7 +5457,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5476,7 +5476,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5497,7 +5497,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5514,7 +5514,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -5562,7 +5562,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5582,7 +5582,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5604,7 +5604,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5622,7 +5622,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5671,7 +5671,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v4, s[2:3] @@ -5694,7 +5694,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5717,7 +5717,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5743,7 +5743,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5801,7 +5801,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5827,7 +5827,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5855,7 +5855,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5886,7 +5886,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5950,7 +5950,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5979,7 +5979,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6014,7 +6014,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6058,7 +6058,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6153,7 +6153,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6236,7 +6236,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6292,7 +6292,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6404,7 +6404,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6446,7 +6446,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6505,7 +6505,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6585,7 +6585,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6754,7 +6754,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6817,7 +6817,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6897,7 +6897,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7004,7 +7004,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7210,7 +7210,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7279,7 +7279,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -7386,7 +7386,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7538,7 +7538,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7861,7 +7861,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7972,7 +7972,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8131,7 +8131,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8339,7 +8339,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8743,7 +8743,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8885,7 +8885,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -8902,7 +8902,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8915,7 +8915,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8953,7 +8953,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -8969,7 +8969,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -8986,7 +8986,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8999,7 +8999,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9039,7 +9039,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9055,7 +9055,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9072,7 +9072,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9085,7 +9085,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9123,7 +9123,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -9139,7 +9139,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9156,7 +9156,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9169,7 +9169,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9209,7 +9209,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9225,7 +9225,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9245,7 +9245,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9261,7 +9261,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9299,7 +9299,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9322,7 +9322,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9344,7 +9344,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9362,7 +9362,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -9413,7 +9413,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9436,7 +9436,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9456,7 +9456,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9476,7 +9476,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9543,7 +9543,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9567,7 +9567,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9590,7 +9590,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9613,7 +9613,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9691,7 +9691,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9717,7 +9717,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9746,7 +9746,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9775,7 +9775,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9882,7 +9882,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9913,7 +9913,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9948,7 +9948,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9983,7 +9983,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10112,7 +10112,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10147,7 +10147,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10195,7 +10195,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10247,7 +10247,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10444,7 +10444,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10490,7 +10490,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10551,7 +10551,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10615,7 +10615,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10858,7 +10858,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10913,7 +10913,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -10999,7 +10999,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11097,7 +11097,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11472,7 +11472,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -11548,7 +11548,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -11661,7 +11661,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11783,7 +11783,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -12252,7 +12252,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index e0c2d00891250b..7411712da31bd8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -40,7 +40,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-NOHSA-VI-LABEL: global_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -115,7 +115,7 @@ entry: define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -132,7 +132,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -145,7 +145,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -200,7 +200,7 @@ entry: define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -218,7 +218,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -332,7 +332,7 @@ entry: define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -349,7 +349,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ entry: define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -434,7 +434,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -447,7 +447,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -502,7 +502,7 @@ entry: define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -622,7 +622,7 @@ entry: define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s10 @@ -672,7 +672,7 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -696,7 +696,7 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,7 +804,7 @@ entry: define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -821,7 +821,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -834,7 +834,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -889,7 +889,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -906,7 +906,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -977,7 +977,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -994,7 +994,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1007,7 +1007,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1092,7 +1092,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1150,7 +1150,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1184,7 +1184,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1369,7 +1369,7 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1458,7 +1458,7 @@ entry: define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1495,7 +1495,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1575,7 +1575,7 @@ entry: define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1689,7 +1689,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1729,7 +1729,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1833,7 +1833,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1859,7 +1859,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2008,7 +2008,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -2108,7 +2108,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2146,7 +2146,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2382,7 +2382,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2432,7 +2432,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2591,7 +2591,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2653,7 +2653,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2751,7 +2751,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3002,7 +3002,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3064,7 +3064,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3162,7 +3162,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3450,9 +3450,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3583,7 +3583,7 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3776,14 +3776,14 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -4269,9 +4269,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4387,7 +4387,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4580,14 +4580,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -5134,7 +5134,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5152,7 +5152,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5166,7 +5166,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5231,7 +5231,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5249,7 +5249,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5326,7 +5326,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5344,7 +5344,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5358,7 +5358,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5418,7 +5418,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5436,7 +5436,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5450,7 +5450,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5513,7 +5513,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5534,7 +5534,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5551,7 +5551,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5621,7 +5621,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5643,7 +5643,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5661,7 +5661,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5735,7 +5735,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5761,7 +5761,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5787,7 +5787,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5879,7 +5879,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5906,7 +5906,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5933,7 +5933,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -6030,7 +6030,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6066,7 +6066,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 @@ -6110,7 +6110,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6248,7 +6248,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6285,7 +6285,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6330,7 +6330,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6477,7 +6477,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6535,7 +6535,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 @@ -6621,7 +6621,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6855,7 +6855,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6915,7 +6915,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7003,7 +7003,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7255,9 +7255,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, 0 @@ -7386,7 +7386,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -7541,7 +7541,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7977,7 +7977,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -8088,7 +8088,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8262,7 +8262,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 4d7f1a9663c3dc..629343b47bc16f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-NOHSA-LABEL: global_load_i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3] @@ -88,7 +88,7 @@ entry: define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v2i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -118,7 +118,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v2i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -167,7 +167,7 @@ entry: define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v3i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -185,7 +185,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v3i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3] @@ -252,7 +252,7 @@ entry: define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v4i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -269,7 +269,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v4i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -315,7 +315,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -331,7 +331,7 @@ entry: define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v8i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -351,7 +351,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v8i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -415,7 +415,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -434,7 +434,7 @@ entry: define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v9i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -457,7 +457,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -492,7 +492,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v9i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -543,7 +543,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v9i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -565,7 +565,7 @@ entry: define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v10i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -588,7 +588,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v10i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -672,7 +672,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v10i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v10, s[2:3] @@ -694,7 +694,7 @@ entry: define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v11i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -718,7 +718,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -753,7 +753,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v11i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -807,7 +807,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v11i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -830,7 +830,7 @@ entry: define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v12i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -853,7 +853,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -888,7 +888,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v12i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -938,7 +938,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v12i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] @@ -960,7 +960,7 @@ entry: define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v16i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -986,7 +986,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v16i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1090,7 +1090,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:32 @@ -1115,7 +1115,7 @@ entry: define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1230,7 +1230,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1470,7 +1470,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1487,7 +1487,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] @@ -1547,7 +1547,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1567,7 +1567,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1669,7 +1669,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1694,7 +1694,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1769,7 +1769,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1795,7 +1795,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1821,7 +1821,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -1902,7 +1902,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1981,7 +1981,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2091,7 +2091,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2179,7 +2179,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v23, s[2:3] @@ -2303,7 +2303,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2365,7 +2365,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,7 +2463,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2613,7 +2613,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 @@ -2674,7 +2674,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2726,7 +2726,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2941,7 +2941,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,9 +2995,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NOHSA-NEXT: s_mov_b32 s14, -1 ; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000 -; SI-NOHSA-NEXT: s_add_u32 s12, s12, s9 +; SI-NOHSA-NEXT: s_add_u32 s12, s12, s11 ; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0 -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3117,7 +3117,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3310,7 +3310,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3587,9 +3587,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA: ; %bb.0: ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3] ; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s13 +; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15 ; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 @@ -3704,7 +3704,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX908-HSA: ; %bb.0: -; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 @@ -3820,7 +3820,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4073,7 +4073,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -4312,7 +4312,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4398,7 +4398,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v32i32: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4432,7 +4432,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4524,7 +4524,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v32i32: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4612,7 +4612,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 22bb01ba2be116..23b57a7efa586c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -7011,24 +7011,24 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s1, s5, 4 +; GFX12-NEXT: s_add_co_i32 s1, s3, 4 ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s1, 3 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX12-NEXT: s_lshl_b32 s3, s1, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -7036,7 +7036,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo @@ -7060,19 +7060,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_brev_b32 s0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 +; GFX12-NEXT: v_add_f32_e32 v0, s3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v1, v0, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v1, v0, s3, vcc_lo ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB28_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_ctz_i32_b32 s5, s1 +; GFX12-NEXT: s_ctz_i32_b32 s3, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s6, v1, s5 -; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: v_readlane_b32 s6, v1, s3 +; GFX12-NEXT: s_lshl_b32 s7, 1, s3 +; GFX12-NEXT: v_writelane_b32 v0, s0, s3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7089,14 +7089,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_8: ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7107,19 +7107,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX940-LABEL: local_ds_fadd: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NEXT: s_mov_b64 s[0:1], exec ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_add_i32 s5, s5, 4 +; GFX940-NEXT: s_add_i32 s3, s3, 4 ; GFX940-NEXT: ; implicit-def: $vgpr1 ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB28_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: s_lshl_b32 s8, s5, 3 +; GFX940-NEXT: s_lshl_b32 s8, s3, 3 ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 @@ -7137,7 +7137,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: ; %bb.3: ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX940-NEXT: s_lshl_b32 s0, s5, 4 +; GFX940-NEXT: s_lshl_b32 s0, s3, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f32 v2, v1 @@ -7154,11 +7154,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB28_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX940-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 -; GFX940-NEXT: v_readlane_b32 s9, v2, s5 -; GFX940-NEXT: s_mov_b32 m0, s5 +; GFX940-NEXT: v_readlane_b32 s9, v2, s3 +; GFX940-NEXT: s_mov_b32 m0, s3 ; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 ; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -7173,12 +7173,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: s_cbranch_execz .LBB28_8 ; GFX940-NEXT: ; %bb.7: -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB28_8: ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_nop 0 @@ -7191,23 +7191,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX11-LABEL: local_ds_fadd: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s1, s5, 4 +; GFX11-NEXT: s_add_i32 s1, s3, 4 ; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s1, 3 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX11-NEXT: s_lshl_b32 s3, s1, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -7215,7 +7215,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 @@ -7237,20 +7237,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 +; GFX11-NEXT: v_add_f32_e32 v0, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 ; GFX11-NEXT: s_lshl_b32 s7, 1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s1 +; GFX11-NEXT: v_writelane_b32 v0, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX11-NEXT: s_cmp_lg_u32 s0, 0 @@ -7264,13 +7264,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB28_8 ; GFX11-NEXT: ; %bb.7: -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_8: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7281,20 +7281,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX10-LABEL: local_ds_fadd: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s1, s5, 4 +; GFX10-NEXT: s_add_i32 s1, s3, 4 ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB28_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s1, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX10-NEXT: s_lshl_b32 s3, s1, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -7303,7 +7303,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_mov_b32 s7, exec_lo -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_and_saveexec_b32 s6, s0 @@ -7325,17 +7325,17 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, s3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB28_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 ; GFX10-NEXT: s_lshl_b32 s7, 1, s1 ; GFX10-NEXT: s_andn2_b32 s0, s0, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s1 +; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 @@ -7347,7 +7347,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB28_8 ; GFX10-NEXT: ; %bb.7: -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -7355,8 +7355,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: .LBB28_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_f32_e32 v0, s2, v0 @@ -7367,19 +7366,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX90A-LABEL: local_ds_fadd: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NEXT: s_mov_b64 s[0:1], exec ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s5, s5, 4 +; GFX90A-NEXT: s_add_i32 s3, s3, 4 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB28_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 +; GFX90A-NEXT: s_lshl_b32 s8, s3, 3 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -7397,7 +7396,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: ; %bb.3: ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 +; GFX90A-NEXT: s_lshl_b32 s0, s3, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f32 v2, v1 @@ -7414,11 +7413,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 -; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -7433,12 +7432,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX90A-NEXT: s_cbranch_execz .LBB28_8 ; GFX90A-NEXT: ; %bb.7: -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_8: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 @@ -7450,19 +7449,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX908-LABEL: local_ds_fadd: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX908-NEXT: s_mov_b64 s[0:1], exec ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_add_i32 s5, s5, 4 +; GFX908-NEXT: s_add_i32 s3, s3, 4 ; GFX908-NEXT: ; implicit-def: $vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB28_2 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX908-NEXT: s_lshl_b32 s8, s5, 3 +; GFX908-NEXT: s_lshl_b32 s8, s3, 3 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 @@ -7480,7 +7479,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: ; %bb.3: ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX908-NEXT: s_lshl_b32 s0, s5, 4 +; GFX908-NEXT: s_lshl_b32 s0, s3, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s0 ; GFX908-NEXT: ds_add_f32 v2, v1 @@ -7497,11 +7496,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s5 -; GFX908-NEXT: s_mov_b32 m0, s5 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 ; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -7516,12 +7515,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX908-NEXT: s_cbranch_execz .LBB28_8 ; GFX908-NEXT: ; %bb.7: -; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: v_mov_b32_e32 v2, s2 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_8: ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, s2 @@ -7533,20 +7532,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX8-LABEL: local_ds_fadd: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s5, s5, 4 +; GFX8-NEXT: s_add_i32 s3, s3, 4 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8-NEXT: s_lshl_b32 s8, s5, 3 +; GFX8-NEXT: s_lshl_b32 s8, s3, 3 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -7564,7 +7563,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX8-NEXT: s_lshl_b32 s0, s5, 4 +; GFX8-NEXT: s_lshl_b32 s0, s3, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: ds_add_f32 v2, v1 @@ -7581,11 +7580,11 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: s_mov_b32 m0, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -7600,13 +7599,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB28_8 ; GFX8-NEXT: ; %bb.7: -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_8: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -7619,19 +7618,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX7-LABEL: local_ds_fadd: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b64 s[0:1], exec ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s5, s5, 4 +; GFX7-NEXT: s_add_i32 s3, s3, 4 ; GFX7-NEXT: ; implicit-def: $vgpr1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB28_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s8, s5, 3 +; GFX7-NEXT: s_lshl_b32 s8, s3, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -7661,7 +7660,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s5, 4 +; GFX7-NEXT: s_lshl_b32 s0, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] @@ -7691,13 +7690,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: ; implicit-def: $vgpr0 ; GFX7-NEXT: .LBB28_8: ; %ComputeLoop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7-NEXT: v_readfirstlane_b32 s8, v1 -; GFX7-NEXT: v_readlane_b32 s9, v2, s5 -; GFX7-NEXT: s_mov_b32 m0, s5 +; GFX7-NEXT: v_readlane_b32 s9, v2, s3 +; GFX7-NEXT: s_mov_b32 m0, s3 ; GFX7-NEXT: v_writelane_b32 v0, s8, m0 ; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -7711,10 +7710,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -7723,14 +7722,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB28_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: .LBB28_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 @@ -7743,19 +7742,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX6-LABEL: local_ds_fadd: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s5, s5, 4 +; GFX6-NEXT: s_add_i32 s3, s3, 4 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB28_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s8, s5, 3 +; GFX6-NEXT: s_lshl_b32 s8, s3, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -7785,7 +7784,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s5, 4 +; GFX6-NEXT: s_lshl_b32 s0, s3, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] @@ -7815,13 +7814,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: ; implicit-def: $vgpr0 ; GFX6-NEXT: .LBB28_8: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 -; GFX6-NEXT: v_readlane_b32 s9, v2, s5 -; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s9, v2, s3 +; GFX6-NEXT: s_mov_b32 m0, s3 ; GFX6-NEXT: v_writelane_b32 v0, s8, m0 ; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -7835,10 +7834,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v3 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7847,14 +7846,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB28_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX6-NEXT: .LBB28_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -7879,31 +7878,31 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd_one_as: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s1, s5, 4 +; GFX12-NEXT: s_add_co_i32 s1, s3, 4 ; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s1, 3 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX12-NEXT: s_lshl_b32 s3, s1, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: .LBB29_2: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX12-NEXT: s_mov_b32 s6, exec_lo @@ -7926,19 +7925,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_brev_b32 s0, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 +; GFX12-NEXT: v_add_f32_e32 v0, s3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v1, v0, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v1, v0, s3, vcc_lo ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB29_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_ctz_i32_b32 s5, s1 +; GFX12-NEXT: s_ctz_i32_b32 s3, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s6, v1, s5 -; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s0, s5 +; GFX12-NEXT: v_readlane_b32 s6, v1, s3 +; GFX12-NEXT: s_lshl_b32 s7, 1, s3 +; GFX12-NEXT: v_writelane_b32 v0, s0, s3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7954,12 +7953,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: ; %bb.8: ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7971,19 +7970,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX940-LABEL: local_ds_fadd_one_as: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NEXT: s_mov_b64 s[0:1], exec ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_add_i32 s5, s5, 4 +; GFX940-NEXT: s_add_i32 s3, s3, 4 ; GFX940-NEXT: ; implicit-def: $vgpr1 ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB29_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: s_lshl_b32 s8, s5, 3 +; GFX940-NEXT: s_lshl_b32 s8, s3, 3 ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 @@ -8001,7 +8000,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: ; %bb.3: ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX940-NEXT: s_lshl_b32 s0, s5, 4 +; GFX940-NEXT: s_lshl_b32 s0, s3, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f32 v2, v1 @@ -8017,11 +8016,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB29_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX940-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 -; GFX940-NEXT: v_readlane_b32 s9, v2, s5 -; GFX940-NEXT: s_mov_b32 m0, s5 +; GFX940-NEXT: v_readlane_b32 s9, v2, s3 +; GFX940-NEXT: s_mov_b32 m0, s3 ; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 ; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -8035,11 +8034,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX940-NEXT: ; %bb.7: -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: ; %bb.8: ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 @@ -8052,30 +8051,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: local_ds_fadd_one_as: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s1, s5, 4 +; GFX11-NEXT: s_add_i32 s1, s3, 4 ; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: s_cbranch_execz .LBB29_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s1, 3 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX11-NEXT: s_lshl_b32 s3, s1, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: .LBB29_2: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 @@ -8095,20 +8094,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 +; GFX11-NEXT: v_add_f32_e32 v0, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_ctz_i32_b32 s1, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: v_readlane_b32 s6, v2, s1 ; GFX11-NEXT: s_lshl_b32 s7, 1, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s1 +; GFX11-NEXT: v_writelane_b32 v0, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX11-NEXT: s_cmp_lg_u32 s0, 0 @@ -8121,11 +8120,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: ; %bb.7: -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX11-NEXT: ; %bb.8: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8136,20 +8135,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: local_ds_fadd_one_as: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s1, s5, 4 +; GFX10-NEXT: s_add_i32 s1, s3, 4 ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB29_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s1, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX10-NEXT: s_lshl_b32 s3, s1, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX10-NEXT: .LBB29_2: @@ -8157,7 +8156,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_and_saveexec_b32 s6, s0 @@ -8176,17 +8175,17 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: s_mov_b32 s0, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v0, s3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s3, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB29_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_ff1_i32_b32 s1, s0 -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 +; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: v_readlane_b32 s6, v2, s1 ; GFX10-NEXT: s_lshl_b32 s7, 1, s1 ; GFX10-NEXT: s_andn2_b32 s0, s0, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s1 +; GFX10-NEXT: v_writelane_b32 v0, s3, s1 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 @@ -8197,12 +8196,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX10-NEXT: ; %bb.7: -; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX10-NEXT: ; %bb.8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -8213,19 +8212,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX90A-LABEL: local_ds_fadd_one_as: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NEXT: s_mov_b64 s[0:1], exec ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s5, s5, 4 +; GFX90A-NEXT: s_add_i32 s3, s3, 4 ; GFX90A-NEXT: ; implicit-def: $vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB29_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 +; GFX90A-NEXT: s_lshl_b32 s8, s3, 3 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 @@ -8243,7 +8242,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ; %bb.3: ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 +; GFX90A-NEXT: s_lshl_b32 s0, s3, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f32 v2, v1 @@ -8259,11 +8258,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 -; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 ; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -8277,11 +8276,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX90A-NEXT: ; %bb.7: -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: ; %bb.8: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8293,19 +8292,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX908-LABEL: local_ds_fadd_one_as: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX908-NEXT: s_mov_b64 s[0:1], exec ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_add_i32 s5, s5, 4 +; GFX908-NEXT: s_add_i32 s3, s3, 4 ; GFX908-NEXT: ; implicit-def: $vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB29_2 ; GFX908-NEXT: ; %bb.1: ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX908-NEXT: s_lshl_b32 s8, s5, 3 +; GFX908-NEXT: s_lshl_b32 s8, s3, 3 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 @@ -8323,7 +8322,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: ; %bb.3: ; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX908-NEXT: s_lshl_b32 s0, s5, 4 +; GFX908-NEXT: s_lshl_b32 s0, s3, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s0 ; GFX908-NEXT: ds_add_f32 v2, v1 @@ -8339,11 +8338,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 -; GFX908-NEXT: v_readlane_b32 s9, v2, s5 -; GFX908-NEXT: s_mov_b32 m0, s5 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 ; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 ; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -8357,11 +8356,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX908-NEXT: ; %bb.7: -; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: v_mov_b32_e32 v2, s2 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: ; %bb.8: ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8373,20 +8372,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: local_ds_fadd_one_as: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s5, s5, 4 +; GFX8-NEXT: s_add_i32 s3, s3, 4 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB29_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX8-NEXT: s_lshl_b32 s8, s5, 3 +; GFX8-NEXT: s_lshl_b32 s8, s3, 3 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -8404,7 +8403,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: ; %bb.3: ; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX8-NEXT: s_lshl_b32 s0, s5, 4 +; GFX8-NEXT: s_lshl_b32 s0, s3, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: ds_add_f32 v2, v1 @@ -8420,11 +8419,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 -; GFX8-NEXT: v_readlane_b32 s9, v2, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 +; GFX8-NEXT: v_readlane_b32 s9, v2, s3 +; GFX8-NEXT: s_mov_b32 m0, s3 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -8438,12 +8437,12 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: ; %bb.7: -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: ; %bb.8: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8456,19 +8455,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: local_ds_fadd_one_as: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b64 s[0:1], exec ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s5, s5, 4 +; GFX7-NEXT: s_add_i32 s3, s3, 4 ; GFX7-NEXT: ; implicit-def: $vgpr1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB29_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s8, s5, 3 +; GFX7-NEXT: s_lshl_b32 s8, s3, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -8498,7 +8497,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s5, 4 +; GFX7-NEXT: s_lshl_b32 s0, s3, 4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: ds_read_b32 v3, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] @@ -8528,13 +8527,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: ; implicit-def: $vgpr0 ; GFX7-NEXT: .LBB29_8: ; %ComputeLoop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX7-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX7-NEXT: v_readfirstlane_b32 s8, v1 -; GFX7-NEXT: v_readlane_b32 s9, v2, s5 -; GFX7-NEXT: s_mov_b32 m0, s5 +; GFX7-NEXT: v_readlane_b32 s9, v2, s3 +; GFX7-NEXT: s_mov_b32 m0, s3 ; GFX7-NEXT: v_writelane_b32 v0, s8, m0 ; GFX7-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX7-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -8548,10 +8547,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v2, v3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -8560,14 +8559,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX7-NEXT: s_cbranch_execnz .LBB29_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7-NEXT: .LBB29_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 @@ -8580,19 +8579,19 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX6-LABEL: local_ds_fadd_one_as: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s5, s5, 4 +; GFX6-NEXT: s_add_i32 s3, s3, 4 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB29_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s8, s5, 3 +; GFX6-NEXT: s_lshl_b32 s8, s3, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -8622,7 +8621,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s5, 4 +; GFX6-NEXT: s_lshl_b32 s0, s3, 4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_read_b32 v3, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] @@ -8652,13 +8651,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: ; implicit-def: $vgpr0 ; GFX6-NEXT: .LBB29_8: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] -; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 +; GFX6-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 -; GFX6-NEXT: v_readlane_b32 s9, v2, s5 -; GFX6-NEXT: s_mov_b32 m0, s5 +; GFX6-NEXT: v_readlane_b32 s9, v2, s3 +; GFX6-NEXT: s_mov_b32 m0, s3 ; GFX6-NEXT: v_writelane_b32 v0, s8, m0 ; GFX6-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] @@ -8672,10 +8671,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v3 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8684,14 +8683,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX6-NEXT: s_cbranch_execnz .LBB29_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX6-NEXT: .LBB29_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index d068e2ae4ec97f..7b919c620f8bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @local_memory(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: ds_read_b32 v0, v0 @@ -51,7 +51,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 { ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_barrier ; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1 @@ -73,7 +73,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 904fb974e3d700..df7460156e6556 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 @@ -47,7 +47,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; MUBUF-NEXT: v_mov_b32_e32 v6, 0 ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc @@ -58,8 +58,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; ; FLATSCR-LABEL: local_stack_offset_uses_sp: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -201,7 +201,7 @@ entry: define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp_flat: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 @@ -251,7 +251,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000 ; MUBUF-NEXT: buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 @@ -272,8 +272,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; ; FLATSCR-LABEL: local_stack_offset_uses_sp_flat: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 @@ -298,7 +298,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index 7814eb603e5541..9999ecc6911224 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 @@ -26,7 +26,7 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -55,16 +55,16 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc0: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB1_2-.Lpost_getpc0)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB1_2-.Lpost_getpc0)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB1_2-.Lpost_getpc0)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB1_2-.Lpost_getpc0)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB1_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 @@ -73,7 +73,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -102,10 +102,10 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 -; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: v_cmp_eq_f32_e64 s[2:3], s0, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 ; GCN-NEXT: s_getpc_b64 s[8:9] @@ -122,7 +122,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -150,7 +150,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -165,11 +165,11 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execnz .LBB3_1 ; GCN-NEXT: ; %bb.3: ; %bb -; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: .Lpost_getpc2: -; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295 -; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s6, s6, (.LBB3_2-.Lpost_getpc2)&4294967295 +; GCN-NEXT: s_addc_u32 s7, s7, (.LBB3_2-.Lpost_getpc2)>>32 +; GCN-NEXT: s_setpc_b64 s[6:7] ; GCN-NEXT: .LBB3_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -254,28 +254,28 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_mov_b64 s[0:1], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: .Lpost_getpc5: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_4-.Lpost_getpc5)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_4-.Lpost_getpc5)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s6, s6, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s7, s7, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[6:7] ; GCN-NEXT: .LBB5_1: ; %Flow ; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -294,17 +294,17 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_mov_b64 vcc, exec ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: .Lpost_getpc6: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_2-.Lpost_getpc6)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_2-.Lpost_getpc6)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s6, s6, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s7, s7, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[6:7] ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: .Lpost_getpc4: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_3-.Lpost_getpc4)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_3-.Lpost_getpc4)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s6, s6, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s7, s7, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[6:7] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index f2dcd151e5b59d..dd77e575e75055 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_flat: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .LBB0_2: ; %for.body @@ -18,10 +18,10 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 ; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GCN-NEXT: s_add_co_i32 s4, s4, -1 +; GCN-NEXT: s_add_co_i32 s6, s6, -1 ; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_store_b128 v[4:5], v[0:3] @@ -50,12 +50,12 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_global: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 @@ -63,9 +63,9 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 ; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 -; GCN-NEXT: s_add_co_i32 s4, s4, -1 +; GCN-NEXT: s_add_co_i32 s6, s6, -1 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1] ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 @@ -94,21 +94,21 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_constant: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cmp_eq_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: .LBB2_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 ; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0 -; GCN-NEXT: s_add_co_i32 s4, s4, -1 +; GCN-NEXT: s_add_co_i32 s6, s6, -1 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 ; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 @@ -139,7 +139,7 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_local: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 5484ba1ed2fe08..30c8739032c906 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; ; GCN-LABEL: break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: undef_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -207,7 +207,7 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: constexpr_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -297,7 +297,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -386,7 +386,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: false_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -479,7 +479,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: invert_true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[4:5], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index 68b07bae032139..52e64197d68152 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -24,7 +24,7 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce ; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: ds_read_u8 v2, v1 offset:2 ; GCN-NEXT: ds_read_u16 v3, v1 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v1, v2 offset:6 ; GCN-NEXT: ds_write_b16 v1, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index 00dcff093c7db2..bf26344a1af79f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -26,7 +26,7 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x2: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 @@ -37,7 +37,7 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i ; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:256 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: global_store_dword v1, v0, s[0:1] @@ -74,7 +74,7 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 ; GCN-NEXT: v_mov_b32_e32 v0, 1 @@ -88,7 +88,7 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 ; GCN-NEXT: ds_read_b32 v0, v0 offset:512 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 9bbcc6988e311f..13932b39ac1a81 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -161,29 +161,31 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f0@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f0@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, f0@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, f0@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: s_mov_b32 s14, s16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_endpgm call void @f0() call void @f1() @@ -200,33 +202,37 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b32 s20, s16 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-NEXT: s_mov_b64 s[18:19], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b32 s14, s20 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b64 s[6:7], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_endpgm @@ -250,35 +256,38 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, f1@gotpcrel32@hi+12 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: s_mov_b32 s14, s16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:16 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f2@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, f2@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, f2@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:16 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_endpgm call void @f1() %ld = load i8, ptr addrspace(3) @v3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index 72a0aceaae12b6..dca9b71a757afb 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -226,33 +226,37 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b32 s20, s16 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-NEXT: s_mov_b64 s[18:19], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b32 s14, s20 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b64 s[6:7], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_endpgm @@ -273,33 +277,37 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b32 s20, s16 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-NEXT: s_mov_b64 s[18:19], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 2 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b32 s14, s20 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b64 s[6:7], s[16:17] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_endpgm @@ -323,38 +331,42 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b32 s20, s16 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-NEXT: s_mov_b64 s[18:19], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b32 s14, s20 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:2 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:2 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b64 s[4:5], s[18:19] +; GCN-NEXT: s_mov_b64 s[6:7], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 79dfabe3b5450e..382f1a8c3f4316 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -7,24 +7,13 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_ids_kernel() { -; GFX9-SDAG-LABEL: workgroup_ids_kernel: -; GFX9-SDAG: ; %bb.0: ; %.entry -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_ids_kernel: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, s6 -; GFX9-GISEL-NEXT: s_mov_b32 s1, s7 -; GFX9-GISEL-NEXT: s_mov_b32 s2, s8 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: ; GFX9ARCH-SDAG: ; %bb.0: ; %.entry @@ -83,23 +72,24 @@ define amdgpu_kernel void @caller() { ; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s9 +; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s11 ; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 -; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s12, s8 +; GFX9-SDAG-NEXT: s_add_u32 s8, s4, 36 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-SDAG-NEXT: s_getpc_b64 s[4:5] +; GFX9-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 ; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -111,26 +101,29 @@ define amdgpu_kernel void @caller() { ; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 ; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s9 +; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s11 ; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-GISEL-NEXT: s_mov_b32 s14, s8 +; GFX9-GISEL-NEXT: s_add_u32 s8, s4, 36 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[12:13], s[0:1] ; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s14 ; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX9ARCH-SDAG-LABEL: caller: @@ -139,25 +132,26 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s8 ; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s4, 36 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s5, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[4:5] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9ARCH-SDAG-NEXT: s_endpgm ; ; GFX9ARCH-GISEL-LABEL: caller: @@ -166,51 +160,55 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s8 ; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s4, 36 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s5, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[12:13], s[0:1] ; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] ; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 ; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13] ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] ; GFX9ARCH-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: caller: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b32 s13, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s12, callee@abs32@lo ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: caller: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s12, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s13, callee@abs32@hi ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call void @callee(i32 %idx) #0 diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll index 40cfa4e7e4dfce..e9a1b38eee157d 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll @@ -93,7 +93,7 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) { define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) { ; GCN-LABEL: add_u64_ss: ; GCN: s_add_u32 -; GCN: s_addc_u32 s1, s5, s7 +; GCN: s_addc_u32 s1, s1, s3 %add = add i64 %v, %a store i64 %add, ptr undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll index 2963e7b765a0d1..82c6584f7b2567 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_lshl_b32 s4, s6, 2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -24,13 +24,13 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0x1fffffff +; GCN-NEXT: s_and_b32 s4, s6, 0x1fffffff ; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -45,13 +45,13 @@ define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitset0_b32 s4, 31 +; GCN-NEXT: s_and_b32 s4, s6, 0x7fffffff ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -67,13 +67,13 @@ define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitset0_b32 s4, 31 +; GCN-NEXT: s_and_b32 s4, s6, 0x7fffffff ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -89,7 +89,7 @@ define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: mulu24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: v_and_b32_e32 v0, 6, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -112,7 +112,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 0abe0f91dc0bf9..5a9259efc0cc80 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,17 +8,17 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s7, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s3, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,16 +54,16 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshrrev_b16 v1, s7, s6 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -77,18 +77,18 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -107,7 +107,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -129,18 +129,18 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0 -; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -165,33 +165,33 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3 +; VI-NEXT: v_lshrrev_b16_e32 v4, s4, v3 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -199,53 +199,51 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dword s8, s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_lshr_b32 s4, s8, 16 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s4, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: lshr_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_lshrrev_b16 v1, s0, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s4, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_lshrrev_b16 v1, s0, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_pk_lshrrev_b16 v1, s4, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -260,33 +258,33 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s6 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0 +; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s4 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -294,53 +292,51 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dword s8, s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_and_b32 s0, s0, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_lshr_b32 s4, s8, 16 +; CI-NEXT: s_and_b32 s5, s8, 0xffff +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s4, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s5, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: lshr_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -355,18 +351,18 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -386,7 +382,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -406,18 +402,18 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -440,18 +436,18 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -470,7 +466,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -487,18 +483,18 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -521,19 +517,19 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -555,7 +551,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -584,19 +580,19 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_lshr_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -622,19 +618,19 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -656,7 +652,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -675,19 +671,19 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: lshr_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 8861ee380be031..c826980991f94f 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -38,80 +38,86 @@ declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0 define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 { ; CHECK-LABEL: kernel_round1: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: s_add_u32 s42, s34, 40 +; CHECK-NEXT: s_add_u32 s44, s34, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_addc_u32 s43, s35, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] -; CHECK-NEXT: s_mov_b32 s33, s14 -; CHECK-NEXT: s_mov_b32 s40, s13 -; CHECK-NEXT: s_mov_b32 s41, s12 -; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_addc_u32 s45, s35, 0 +; CHECK-NEXT: s_mov_b32 s43, s14 +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_mov_b32 s42, s15 +; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v45, 0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v43, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 -; CHECK-NEXT: global_load_dword v0, v0, s[48:49] +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: global_load_dword v0, v0, s[52:53] +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 ; CHECK-NEXT: v_mov_b32_e32 v1, 12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 -; CHECK-NEXT: s_mov_b32 s42, exec_lo +; CHECK-NEXT: s_mov_b32 s44, exec_lo ; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_25 ; CHECK-NEXT: ; %bb.1: ; %.preheader5 @@ -130,7 +136,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 -; CHECK-NEXT: s_mov_b32 s43, 0 +; CHECK-NEXT: s_mov_b32 s45, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_25 @@ -138,31 +144,31 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s49, 0 +; CHECK-NEXT: s_mov_b32 s47, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s49, v44 -; CHECK-NEXT: s_lshl_b32 s4, s49, 5 -; CHECK-NEXT: s_add_i32 s48, s49, 1 -; CHECK-NEXT: s_add_i32 s5, s49, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s48 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s47, v44 +; CHECK-NEXT: s_lshl_b32 s4, s47, 5 +; CHECK-NEXT: s_add_i32 s46, s47, 1 +; CHECK-NEXT: s_add_i32 s5, s47, 5 +; CHECK-NEXT: v_or3_b32 v57, s4, v43, s46 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v56, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s48 +; CHECK-NEXT: v_mov_b32_e32 v58, s46 ; CHECK-NEXT: s_mov_b32 s52, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_mov_b32 s53, 0 -; CHECK-NEXT: s_mov_b32 s54, 0 +; CHECK-NEXT: s_mov_b32 s56, 0 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 -; CHECK-NEXT: s_add_i32 s54, s54, 4 -; CHECK-NEXT: s_add_i32 s4, s49, s54 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s54, v57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 +; CHECK-NEXT: s_add_i32 s56, s56, 4 +; CHECK-NEXT: s_add_i32 s4, s47, s56 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s56, v57 ; CHECK-NEXT: s_add_i32 s5, s4, 5 ; CHECK-NEXT: s_add_i32 s4, s4, 1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 @@ -172,103 +178,107 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_cbranch_execz .LBB0_16 ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s54, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s54, v57 +; CHECK-NEXT: v_add_nc_u32_e32 v59, s56, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s56, v57 ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 +; CHECK-NEXT: s_and_saveexec_b32 s57, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 +; CHECK-NEXT: s_and_saveexec_b32 s57, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 +; CHECK-NEXT: s_and_saveexec_b32 s57, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s57 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s55, s4 +; CHECK-NEXT: s_and_saveexec_b32 s57, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 ; CHECK-NEXT: s_branch .LBB0_7 @@ -279,7 +289,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_17: ; %Flow46 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_mov_b32 s49, exec_lo +; CHECK-NEXT: s_mov_b32 s47, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_23 ; CHECK-NEXT: ; %bb.18: ; %.preheader @@ -309,16 +319,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v57 ; CHECK-NEXT: s_branch .LBB0_19 @@ -328,32 +339,33 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: .LBB0_23: ; %Flow44 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s46, v45 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s49, s48 +; CHECK-NEXT: s_mov_b32 s47, s46 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s43, s4, s43 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s43 +; CHECK-NEXT: s_or_b32 s45, s4, s45 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 ; CHECK-NEXT: .LBB0_25: ; %Flow51 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_mov_b32 s4, exec_lo ; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360 @@ -361,31 +373,32 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41 ; CHECK-NEXT: s_cbranch_execz .LBB0_33 ; CHECK-NEXT: ; %bb.26: -; CHECK-NEXT: s_mov_b32 s42, 0 +; CHECK-NEXT: s_mov_b32 s44, 0 ; CHECK-NEXT: s_branch .LBB0_28 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z14get_local_sizej@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z14get_local_sizej@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41 -; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; CHECK-NEXT: s_cbranch_execz .LBB0_33 ; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41 -; CHECK-NEXT: s_mov_b32 s43, exec_lo +; CHECK-NEXT: s_mov_b32 s45, exec_lo ; CHECK-NEXT: ds_read_b32 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0 @@ -394,8 +407,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62 ; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72 -; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4 +; CHECK-NEXT: v_add_co_u32 v2, s4, s48, v1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s49, 0, s4 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -426,27 +439,28 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 15, v1 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4 -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_addPU3AS1Vjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_addPU3AS1Vjj@rel32@hi+12 ; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_addPU3AS1Vjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_addPU3AS1Vjj@rel32@hi+12 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73 ; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0 ; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1 ; CHECK-NEXT: v_and_b32_e32 v74, 28, v1 -; CHECK-NEXT: v_add_co_u32 v42, s4, s50, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s51, 0, s4 +; CHECK-NEXT: v_add_co_u32 v42, s4, s54, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s55, 0, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, v44 ; CHECK-NEXT: v_mov_b32_e32 v0, v42 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_mov_b32_e32 v1, v43 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4 ; CHECK-NEXT: s_mov_b32 s4, exec_lo ; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0 @@ -455,7 +469,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58 ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57] -; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[46:47] +; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51] ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72 @@ -486,15 +500,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v2, v44 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_branch .LBB0_27 ; CHECK-NEXT: .LBB0_33: ; CHECK-NEXT: s_endpgm @@ -772,82 +787,88 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 { ; CHECK-LABEL: kernel_round1_short: ; CHECK: ; %bb.0: ; %.5 -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_load_dwordx2 s[46:47], s[8:9], 0x10 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: s_add_u32 s42, s36, 40 +; CHECK-NEXT: s_add_u32 s44, s38, 40 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] -; CHECK-NEXT: s_addc_u32 s43, s37, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] -; CHECK-NEXT: s_mov_b32 s33, s14 -; CHECK-NEXT: s_mov_b32 s40, s13 -; CHECK-NEXT: s_mov_b32 s41, s12 -; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_addc_u32 s45, s39, 0 +; CHECK-NEXT: s_mov_b32 s43, s14 +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_mov_b32 s42, s15 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] +; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v43, 0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] -; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43] +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 +; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 -; CHECK-NEXT: global_load_dword v0, v0, s[44:45] +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: global_load_dword v0, v0, s[46:47] +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 ; CHECK-NEXT: v_mov_b32_e32 v1, 12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42 -; CHECK-NEXT: s_mov_b32 s42, 0 +; CHECK-NEXT: s_mov_b32 s44, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 ; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 @@ -857,12 +878,12 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 ; CHECK-NEXT: s_lshl_b32 s5, s4, 5 -; CHECK-NEXT: s_add_i32 s43, s4, 1 +; CHECK-NEXT: s_add_i32 s45, s4, 1 ; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s43 +; CHECK-NEXT: v_or3_b32 v47, s5, v42, s45 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v46, v0 -; CHECK-NEXT: v_mov_b32_e32 v56, s43 +; CHECK-NEXT: v_mov_b32_e32 v56, s45 ; CHECK-NEXT: s_mov_b32 s5, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_5 @@ -891,23 +912,23 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: .LBB1_5: ; %Flow4 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; CHECK-NEXT: s_mov_b32 s44, exec_lo +; CHECK-NEXT: s_mov_b32 s46, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_11 ; CHECK-NEXT: ; %bb.6: ; %.103.preheader ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_mov_b32 s45, 0 +; CHECK-NEXT: s_mov_b32 s47, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB1_8 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_7: ; %.114 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 -; CHECK-NEXT: s_or_b32 s45, vcc_lo, s45 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45 +; CHECK-NEXT: s_or_b32 s47, vcc_lo, s47 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s47 ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_8: ; %.103 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 @@ -916,59 +937,61 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s46, s4 +; CHECK-NEXT: s_and_saveexec_b32 s48, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_7 ; CHECK-NEXT: ; %bb.9: ; %.110 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 -; CHECK-NEXT: s_add_u32 s8, s36, 40 -; CHECK-NEXT: s_addc_u32 s9, s37, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_add_u32 s8, s38, 40 +; CHECK-NEXT: s_addc_u32 s9, s39, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v47 ; CHECK-NEXT: s_branch .LBB1_7 ; CHECK-NEXT: .LBB1_10: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s47 ; CHECK-NEXT: .LBB1_11: ; %Flow2 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46 ; CHECK-NEXT: ; %bb.12: ; %.32 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s43, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s45, v45 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s42, s4, s42 -; CHECK-NEXT: s_mov_b32 s4, s43 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: s_or_b32 s44, s4, s44 +; CHECK-NEXT: s_mov_b32 s4, s45 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.13: ; %.119 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_add_u32 s8, s36, 40 -; CHECK-NEXT: s_addc_u32 s9, s37, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] +; CHECK-NEXT: s_add_u32 s8, s38, 40 +; CHECK-NEXT: s_addc_u32 s9, s39, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s41 -; CHECK-NEXT: s_mov_b32 s13, s40 +; CHECK-NEXT: s_mov_b32 s12, s43 +; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm .5: %.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4 diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index 48dc95312f5995..c5009428400539 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @mad_u16( ; GFX8-LABEL: mad_u16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -35,39 +35,39 @@ define amdgpu_kernel void @mad_u16( ; ; GFX9-LABEL: mad_u16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[10:11] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc +; GFX9-NEXT: global_load_ushort v3, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mad_u16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[10:11] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[8:9] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[12:13] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v3, v0, s[10:11] glc dlc +; GFX10-NEXT: global_load_ushort v3, v0, s[14:15] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mad_u16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index e876a8d9dda692..245a2775d9f2fb 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 85b4fd0602f14c..33007e5b285d80 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -908,84 +908,84 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; CI-LABEL: mad_i64_i32_uniform: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s7 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1] -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: v_mov_b32_e32 v2, s3 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; SI-LABEL: mad_i64_i32_uniform: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mul_hi_u32 v1, s6, v0 -; SI-NEXT: s_mul_i32 s2, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mul_hi_u32 v1, s2, v0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mul_i32 s0, s2, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: mad_i64_i32_uniform: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s2, s6, s7 -; GFX9-NEXT: s_add_u32 s0, s3, s0 -; GFX9-NEXT: s_addc_u32 s1, s2, s1 -; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_add_u32 s2, s2, s6 +; GFX9-NEXT: s_addc_u32 s3, s4, s7 +; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: mad_i64_i32_uniform: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s6, s7 -; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7 -; GFX11-NEXT: s_add_u32 s0, s2, s0 -; GFX11-NEXT: s_addc_u32 s1, s3, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_mul_i32 s6, s2, s3 +; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX11-NEXT: s_add_u32 s2, s6, s4 +; GFX11-NEXT: s_addc_u32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: mad_i64_i32_uniform: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b32 s7, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s2, s6 -; GFX12-NEXT: s_mov_b32 s6, s7 -; GFX12-NEXT: s_mov_b32 s7, s3 +; GFX12-NEXT: s_mov_b32 s6, s2 +; GFX12-NEXT: s_mov_b32 s2, s3 +; GFX12-NEXT: s_mov_b32 s3, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[6:7] -; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3] +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %ext0 = zext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 5bcdf4d0aaf16f..228a85058152b9 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -15,39 +15,39 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000 -; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v5, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_madak_f32 v2, v5, v2, 0x41200000 @@ -56,95 +56,95 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 -; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_endpgm ; ; GFX940-FMA-LABEL: madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid @@ -166,7 +166,7 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { ; GFX6-LABEL: madak_2_use_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -191,7 +191,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX8-LABEL: madak_2_use_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -221,46 +221,46 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3 -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v4, s[6:7] offset:4 +; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc +; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[6:7] offset:4 +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] offset:4 ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_2_use_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -283,47 +283,47 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc +; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[6:7] offset:4 +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4 ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_2_use_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -367,7 +367,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 { ; GFX6-LABEL: madak_m_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -383,7 +383,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX8-LABEL: madak_m_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -400,29 +400,29 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -436,30 +436,30 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -486,39 +486,39 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0 -; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v5, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_f32 v2, v5, v2, 4.0 @@ -527,95 +527,95 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX9-LABEL: madak_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_inline_imm_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_inline_imm_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_endpgm ; ; GFX940-FMA-LABEL: madak_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_inline_imm_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_inline_imm_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid @@ -635,123 +635,120 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 { ; GFX6-LABEL: s_v_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 -; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mac_f32_e32 v3, s8, v2 +; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_v_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3 +; GFX8-NEXT: v_mac_f32_e32 v2, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_v_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mac_f32_e32 v2, s0, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: v_mac_f32_e32 v2, s6, v1 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_v_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-MAD-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: s_v_madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 -; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_endpgm ; ; GFX940-FMA-LABEL: s_v_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-FMA-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s0, v1 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s6, v1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_v_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FMA-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: s_v_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FMA-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x41200000 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid @@ -768,32 +765,32 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: v_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 -; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mac_f32_e32 v3, s6, v2 +; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: v_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -806,46 +803,46 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a ; ; GFX9-LABEL: v_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mac_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: v_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-MAD-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 +; GFX10-MAD-NEXT: v_madak_f32 v1, s2, v1, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: v_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] @@ -853,47 +850,47 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a ; ; GFX940-FMA-LABEL: v_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[0:1] -; GFX940-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s4, v1 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s2, v1 ; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: v_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-FMA-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 +; GFX10-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: v_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 +; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -911,7 +908,7 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-LABEL: s_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -925,7 +922,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX8-LABEL: s_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -937,28 +934,28 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mac_f32_e32 v1, s6, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-MAD-NEXT: v_madak_f32 v0, s6, v0, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000 +; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: s_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -968,28 +965,28 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s7 -; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s6, v2 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-FMA-NEXT: v_fmaak_f32 v0, s6, v0, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 +; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: s_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1005,40 +1002,40 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src0_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b32 s4, 0x41200000 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s0 -; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s4 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: no_madak_src0_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v5, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: s_mov_b32 s0, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1048,97 +1045,97 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src0_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s2, 0x41200000 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v1|, v2 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 -; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_endpgm ; ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] +; GFX940-FMA-NEXT: s_mov_b32 s2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s0 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s2 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid @@ -1159,40 +1156,40 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src1_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b32 s4, 0x41200000 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s0 -; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s4 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: no_madak_src1_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v5, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: s_mov_b32 s0, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1202,97 +1199,97 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src1_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s2, 0x41200000 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v1, |v2| ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 -; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_endpgm ; ; GFX940-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] +; GFX940-FMA-NEXT: s_mov_b32 s2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s0 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s2 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid @@ -1316,34 +1313,34 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { ; GFX6-LABEL: madak_constant_bus_violation: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX6-NEXT: ; %bb.1: ; %bb3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: .LBB9_2: ; %bb4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x12 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x12 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mac_f32_e64 v1, s0, 0.5 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_constant_bus_violation: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_2 @@ -1354,7 +1351,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX8-NEXT: .LBB9_2: ; %bb4 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x48 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1365,7 +1362,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX9-LABEL: madak_constant_bus_violation: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 @@ -1376,7 +1373,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX9-NEXT: .LBB9_2: ; %bb4 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x48 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1387,7 +1384,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-MAD-LABEL: madak_constant_bus_violation: ; GFX10-MAD: ; %bb.0: ; %bb -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-MAD-NEXT: s_cbranch_scc1 .LBB9_2 @@ -1398,7 +1395,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-MAD-NEXT: s_load_dword s0, s[4:5], 0x48 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x42280000 @@ -1409,7 +1406,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-MAD-LABEL: madak_constant_bus_violation: ; GFX11-MAD: ; %bb.0: ; %bb -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-MAD-NEXT: s_cbranch_scc1 .LBB9_2 @@ -1420,7 +1417,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX11-MAD-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-MAD-NEXT: s_load_b32 s0, s[4:5], 0x48 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, s0, 0.5 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1432,7 +1429,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX940-FMA-LABEL: madak_constant_bus_violation: ; GFX940-FMA: ; %bb.0: ; %bb -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX940-FMA-NEXT: s_cbranch_scc1 .LBB9_2 @@ -1443,7 +1440,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX940-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX940-FMA-NEXT: global_load_dword v0, v[0:1], off sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX940-FMA-NEXT: s_load_dword s0, s[4:5], 0x48 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e64 v1, s0, 0.5 @@ -1454,7 +1451,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-FMA-LABEL: madak_constant_bus_violation: ; GFX10-FMA: ; %bb.0: ; %bb -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-FMA-NEXT: s_cbranch_scc1 .LBB9_2 @@ -1465,7 +1462,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-FMA-NEXT: s_load_dword s0, s[4:5], 0x48 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000 @@ -1476,7 +1473,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-FMA-LABEL: madak_constant_bus_violation: ; GFX11-FMA: ; %bb.0: ; %bb -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-FMA-NEXT: s_cbranch_scc1 .LBB9_2 @@ -1487,7 +1484,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX11-FMA-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-FMA-NEXT: s_load_b32 s0, s[4:5], 0x48 ; GFX11-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index 6910004a9ef505..401724443567af 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -6,13 +6,13 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-LABEL: test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x1c -; GFX9-NEXT: s_load_dword s5, s[6:7], 0x38 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s4, s[8:9], 0x1c +; GFX9-NEXT: s_load_dword s5, s[8:9], 0x38 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s10 +; GFX9-NEXT: s_mul_i32 s12, s12, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s12 ; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] @@ -34,13 +34,13 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-LABEL: test: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x1c -; GFX10-NEXT: s_load_dword s5, s[6:7], 0x38 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s4, s[8:9], 0x1c +; GFX10-NEXT: s_load_dword s5, s[8:9], 0x38 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s10, v0 +; GFX10-NEXT: s_mul_i32 s12, s12, s4 +; GFX10-NEXT: v_add3_u32 v0, s5, s12, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 @@ -59,16 +59,16 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX11-LABEL: test: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x1c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x38 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c +; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_and_b32 s4, s6, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s13, s13, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add3_u32 v0, s5, s13, v0 +; GFX11-NEXT: v_add3_u32 v0, s7, s13, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 302b140e32f3aa..95914857b87328 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1639,7 +1639,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32 -; GFX10_1-NEXT: s_lshl_b32 s4, s6, 2 +; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 ; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: v_add_nc_u32_e32 v1, s4, v3 @@ -1670,7 +1670,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v3, 5, s32 -; GFX10_3-NEXT: s_lshl_b32 s4, s6, 2 +; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 ; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: v_add_nc_u32_e32 v1, s4, v3 @@ -1761,12 +1761,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s7, s32, 0x201000 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s7 ; 4-byte Folded Spill +; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040 -; GFX8-NEXT: s_lshl_b32 s4, s6, 2 +; GFX8-NEXT: s_lshl_b32 s4, s16, 2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_writelane_b32 v2, s59, 0 @@ -1792,11 +1792,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: s_add_i32 s7, s32, 0x201000 -; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s7 ; 4-byte Folded Spill +; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 +; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX900-NEXT: s_lshl_b32 s4, s6, 2 +; GFX900-NEXT: s_lshl_b32 s4, s16, 2 ; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 ; GFX900-NEXT: v_add_u32_e32 v0, s4, v0 ; GFX900-NEXT: v_writelane_b32 v2, s59, 0 diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll index 9b413f95abae36..af713179a888dd 100644 --- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll +++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_store_chain: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 @@ -91,7 +91,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_store_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_store_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -393,7 +393,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_load_chain: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3e ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -666,7 +666,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_load_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -940,7 +940,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_load_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index a8139cc6bc4c95..1857eaba0a2a97 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -6,20 +6,20 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v2, v5, v2 @@ -28,15 +28,15 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -54,20 +54,20 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v3, v5, v2 @@ -78,15 +78,15 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid @@ -104,15 +104,15 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc @@ -122,8 +122,8 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v8, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -139,25 +139,25 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-NEXT: global_load_dword v4, v0, s[2:3] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_short_d16 v1, v0, s[6:7] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4 -; GFX9-NEXT: global_store_dword v0, v3, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v0, v3, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, ptr addrspace(1) %aptr, i32 %tid @@ -175,20 +175,20 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v6, v1, v3 @@ -202,16 +202,16 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, ptr addrspace(1) %aptr, i32 %tid @@ -229,20 +229,20 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sgt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v2, v5, v2 @@ -251,15 +251,15 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -277,20 +277,20 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_uge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u16_e32 v2, v5, v2 @@ -299,15 +299,15 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -325,20 +325,20 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u16_e32 v2, v5, v2 @@ -347,15 +347,15 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -372,20 +372,20 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u16_e32 v3, v5, v2 @@ -396,15 +396,15 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index 4fb90bbc46a8f5..3d8d849ad32421 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_i32: @@ -58,26 +58,26 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v3, s7, v3 -; SI-NEXT: v_max_i32_e32 v2, s6, v2 -; SI-NEXT: v_max_i32_e32 v1, s5, v1 -; SI-NEXT: v_max_i32_e32 v0, s4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: v_max_i32_e32 v3, s11, v3 +; SI-NEXT: v_max_i32_e32 v2, s10, v2 +; SI-NEXT: v_max_i32_e32 v1, s9, v1 +; SI-NEXT: v_max_i32_e32 v0, s8, v0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_v4i32: @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -146,12 +146,12 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sge_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_max_i32 s4, s4, 9 +; SI-NEXT: s_max_i32 s4, s6, 9 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -175,24 +175,24 @@ define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_sbyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_i8: @@ -240,12 +240,12 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_max_i32 s4, s4, 9 +; SI-NEXT: s_max_i32 s4, s6, 9 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -269,7 +269,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -303,23 +303,23 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sgt_i32: @@ -355,7 +355,7 @@ define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -385,23 +385,23 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i32: @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -467,20 +467,20 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32> %a, <3 x i32> %b) nounwind { ; SI-LABEL: s_test_umax_uge_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_max_u32 s2, s6, s10 -; SI-NEXT: s_max_u32 s0, s5, s9 -; SI-NEXT: s_max_u32 s1, s4, s8 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; SI-NEXT: s_max_u32 s6, s10, s14 +; SI-NEXT: s_max_u32 s4, s9, s13 +; SI-NEXT: s_max_u32 s5, s8, s12 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: s_test_umax_uge_v3i32: @@ -507,24 +507,24 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32 define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i8: @@ -565,20 +565,20 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dword s0, s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, s0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_ugt_i32: @@ -614,7 +614,7 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -644,7 +644,7 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_umax_ugt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -680,14 +680,14 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_umax_ugt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dword s7, s[4:5], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_and_b32 s5, s7, 0xffff ; SI-NEXT: s_max_u32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -727,14 +727,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspac define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_max_slt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dword s7, s[4:5], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s4 -; SI-NEXT: s_sext_i32_i16 s5, s5 +; SI-NEXT: s_sext_i32_i16 s4, s6 +; SI-NEXT: s_sext_i32_i16 s5, s7 ; SI-NEXT: s_max_i32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -773,14 +773,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dword s7, s[4:5], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s4 -; SI-NEXT: s_sext_i32_i16 s5, s5 +; SI-NEXT: s_sext_i32_i16 s4, s6 +; SI-NEXT: s_sext_i32_i16 s5, s7 ; SI-NEXT: s_max_i32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -826,22 +826,22 @@ define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_umax_ugt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, s7, s9 -; SI-NEXT: s_cselect_b32 s5, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, s3, s9 +; SI-NEXT: s_cselect_b32 s1, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: test_umax_ugt_i64: @@ -868,22 +868,22 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_umax_uge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, s7, s9 -; SI-NEXT: s_cselect_b32 s5, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1] +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, s3, s9 +; SI-NEXT: s_cselect_b32 s1, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: test_umax_uge_i64: @@ -910,22 +910,22 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sgt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_gt_i64_e32 vcc, s[6:7], v[0:1] -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, s7, s9 -; SI-NEXT: s_cselect_b32 s5, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, s3, s9 +; SI-NEXT: s_cselect_b32 s1, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: test_imax_sgt_i64: @@ -952,22 +952,22 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, s7, s9 -; SI-NEXT: s_cselect_b32 s5, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1] +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, s3, s9 +; SI-NEXT: s_cselect_b32 s1, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: test_imax_sge_i64: diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 25a6c80b917946..5e46fd6b28d275 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -824,7 +824,7 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -832,7 +832,7 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -840,7 +840,7 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -873,7 +873,7 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { ; GFX8-LABEL: v_maximumnum_f16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -881,7 +881,7 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { ; GFX9-LABEL: v_maximumnum_f16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -889,7 +889,7 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { ; GFX10-LABEL: v_maximumnum_f16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -923,24 +923,24 @@ define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) { ; GFX8-LABEL: v_maximumnum_f16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 -; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e64 v0, s17, s17 +; GFX8-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_f16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 -; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e64 v0, s17, s17 +; GFX9-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximumnum_f16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 -; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e64 v0, s17, s17 +; GFX10-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -974,7 +974,7 @@ define float @v_maximumnum_f32_s_v(float inreg %x, float %y) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -982,7 +982,7 @@ define float @v_maximumnum_f32_s_v(float inreg %x, float %y) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -990,7 +990,7 @@ define float @v_maximumnum_f32_s_v(float inreg %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1023,7 +1023,7 @@ define float @v_maximumnum_f32_v_s(float %x, float inreg %y) { ; GFX8-LABEL: v_maximumnum_f32_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1031,7 +1031,7 @@ define float @v_maximumnum_f32_v_s(float %x, float inreg %y) { ; GFX9-LABEL: v_maximumnum_f32_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1039,7 +1039,7 @@ define float @v_maximumnum_f32_v_s(float %x, float inreg %y) { ; GFX10-LABEL: v_maximumnum_f32_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1073,24 +1073,24 @@ define float @v_maximumnum_f32_s_s(float inreg %x, float inreg %y) { ; GFX8-LABEL: v_maximumnum_f32_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 -; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_f32_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 -; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e64 v0, s17, s17 +; GFX9-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximumnum_f32_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 -; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e64 v0, s17, s17 +; GFX10-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1124,7 +1124,7 @@ define double @v_maximumnum_f64_s_v(double inreg %x, double %y) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1132,14 +1132,14 @@ define double @v_maximumnum_f64_s_v(double inreg %x, double %y) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximumnum_f64_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1173,7 +1173,7 @@ define double @v_maximumnum_f64_v_s(double %x, double inreg %y) { ; GFX8-LABEL: v_maximumnum_f64_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1181,7 +1181,7 @@ define double @v_maximumnum_f64_v_s(double %x, double inreg %y) { ; GFX9-LABEL: v_maximumnum_f64_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1189,7 +1189,7 @@ define double @v_maximumnum_f64_v_s(double %x, double inreg %y) { ; GFX10-LABEL: v_maximumnum_f64_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1223,24 +1223,24 @@ define double @v_maximumnum_f64_s_s(double inreg %x, double inreg %y) { ; GFX8-LABEL: v_maximumnum_f64_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], s[18:19], s[18:19] +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_f64_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], s[18:19], s[18:19] +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximumnum_f64_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] -; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], s[18:19], s[18:19] +; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 8c28fac0d839c2..851c9bb02a3456 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 { ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v12, s3 ; CHECK-NEXT: v_mov_b32_e32 v11, s2 @@ -34,7 +34,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 { ; CHECK-LABEL: memcpy_p1_p1_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -58,7 +58,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p1_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] @@ -96,10 +96,10 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -162,8 +162,8 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v26, s0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 ; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 @@ -226,7 +226,7 @@ entry: define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p3_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -262,7 +262,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v16, 0 ; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 @@ -293,7 +293,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v12, s3 ; CHECK-NEXT: v_mov_b32_e32 v11, s2 @@ -319,7 +319,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 { ; CHECK-LABEL: memcpy_p1_p1_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -343,7 +343,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p1_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] @@ -381,10 +381,10 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -447,8 +447,8 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 +; CHECK-NEXT: s_add_u32 s16, s16, s15 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v26, s0 @@ -460,7 +460,7 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 ; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 ; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 ; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 ; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 @@ -511,7 +511,7 @@ entry: define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p3_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -547,7 +547,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v16, 0 ; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll index 3a6d8ca1e35f60..3fa5ec61829fc4 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll @@ -9,7 +9,7 @@ define void @memcpy_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -26,15 +26,15 @@ define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz31_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s8 -; CHECK-NEXT: v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[6:7] offset:15 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[16:17] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -47,7 +47,7 @@ define void @memcpy_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[16:17], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll index b32bfd0e495ba1..8fdecfac109274 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll @@ -9,7 +9,7 @@ define void @memmove_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -27,8 +27,8 @@ define void @memmove_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_load_ubyte v9, v2, s[6:7] offset:30 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[16:17], 0x0 +; CHECK-NEXT: global_load_ubyte v9, v2, s[16:17] offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -53,7 +53,7 @@ define void @memmove_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[16:17], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s8 ; CHECK-NEXT: v_mov_b32_e32 v3, s9 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 23a4cac25d1aa1..321f572d57cb21 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -15,9 +15,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -30,10 +29,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -46,10 +43,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -62,9 +57,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -77,10 +71,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -91,10 +83,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -105,10 +95,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -119,10 +107,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -133,9 +119,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX11-WGP-LABEL: flat_agent_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -148,9 +133,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX11-CU-LABEL: flat_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -163,10 +147,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX12-WGP-LABEL: flat_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -179,10 +161,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX12-CU-LABEL: flat_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -202,9 +182,8 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX7-LABEL: flat_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -217,10 +196,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -233,10 +210,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -249,9 +224,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -264,10 +238,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -278,10 +250,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -292,10 +262,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -306,10 +274,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -320,9 +286,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -335,9 +300,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX11-CU-LABEL: flat_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -350,10 +314,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -366,10 +328,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX12-CU-LABEL: flat_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -389,9 +349,8 @@ entry: define amdgpu_kernel void @flat_agent_acquire_load( ; GFX7-LABEL: flat_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -405,10 +364,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -423,10 +380,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -441,9 +396,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -456,10 +410,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -471,10 +423,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -486,10 +436,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -501,10 +449,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -516,9 +462,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX11-WGP-LABEL: flat_agent_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -533,9 +478,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX11-CU-LABEL: flat_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -550,10 +494,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX12-WGP-LABEL: flat_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -567,10 +509,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX12-CU-LABEL: flat_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -591,9 +531,8 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX7-LABEL: flat_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -608,10 +547,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -628,10 +565,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -648,9 +583,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -664,10 +598,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -680,10 +612,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -696,10 +626,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -712,10 +640,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -728,9 +654,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -747,9 +672,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -766,10 +690,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -789,10 +711,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -819,8 +739,8 @@ entry: define amdgpu_kernel void @flat_agent_unordered_store( ; GFX7-LABEL: flat_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -830,9 +750,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -842,9 +761,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -854,8 +772,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -865,9 +783,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -876,9 +793,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -887,9 +803,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -898,9 +813,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -909,8 +823,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX11-WGP-LABEL: flat_agent_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -920,8 +834,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX11-CU-LABEL: flat_agent_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -931,8 +845,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX12-WGP-LABEL: flat_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -942,8 +856,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX12-CU-LABEL: flat_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -959,8 +873,8 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX7-LABEL: flat_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -970,9 +884,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -982,9 +895,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -994,8 +906,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1005,9 +917,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1016,9 +927,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1027,9 +937,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1038,9 +947,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1049,8 +957,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1060,8 +968,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX11-CU-LABEL: flat_agent_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1071,8 +979,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1082,8 +990,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX12-CU-LABEL: flat_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1099,8 +1007,8 @@ entry: define amdgpu_kernel void @flat_agent_release_store( ; GFX7-LABEL: flat_agent_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1111,9 +1019,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-WGP-LABEL: flat_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1125,9 +1032,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-CU-LABEL: flat_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1139,8 +1045,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1151,9 +1057,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1163,9 +1068,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1175,9 +1079,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1188,9 +1091,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1201,8 +1103,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX11-WGP-LABEL: flat_agent_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1214,8 +1116,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX11-CU-LABEL: flat_agent_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1227,8 +1129,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX12-WGP-LABEL: flat_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1242,8 +1144,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX12-CU-LABEL: flat_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1263,8 +1165,8 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX7-LABEL: flat_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1275,9 +1177,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1289,9 +1190,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1303,8 +1203,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1315,9 +1215,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1327,9 +1226,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1339,9 +1237,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1352,9 +1249,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1365,8 +1261,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1378,8 +1274,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1391,8 +1287,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1406,8 +1302,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1427,9 +1323,8 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1439,10 +1334,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1452,10 +1345,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1465,9 +1356,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1477,10 +1367,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1489,10 +1377,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1501,10 +1387,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1513,10 +1397,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1525,9 +1407,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1537,9 +1418,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1549,10 +1429,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1562,10 +1440,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1581,9 +1457,8 @@ entry: define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1595,10 +1470,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1612,10 +1485,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1629,9 +1500,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1642,10 +1512,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1656,10 +1524,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1670,10 +1536,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1684,10 +1548,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1698,9 +1560,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1714,9 +1575,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1730,10 +1590,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1745,10 +1603,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1766,9 +1622,8 @@ entry: define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX7-LABEL: flat_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1779,10 +1634,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1794,10 +1647,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1809,9 +1660,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1822,10 +1672,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1835,10 +1683,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1848,10 +1694,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1862,10 +1706,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1876,9 +1718,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1890,9 +1731,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1904,10 +1744,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1921,10 +1759,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1944,9 +1780,8 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1959,10 +1794,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1978,10 +1811,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1997,9 +1828,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -2011,10 +1841,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2026,10 +1854,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2041,10 +1867,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2057,10 +1881,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2073,9 +1895,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2091,9 +1912,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2109,10 +1929,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2128,10 +1946,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2153,9 +1969,8 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2168,10 +1983,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -2187,10 +2000,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -2206,9 +2017,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -2220,10 +2030,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2235,10 +2043,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2250,10 +2056,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2266,10 +2070,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2282,9 +2084,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2300,9 +2101,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2318,10 +2118,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2337,10 +2135,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2362,8 +2158,8 @@ entry: define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2378,9 +2174,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2396,9 +2191,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2414,8 +2208,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2429,9 +2223,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2444,9 +2237,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2459,9 +2251,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2474,9 +2265,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2489,8 +2279,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2506,8 +2296,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2523,8 +2313,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2539,8 +2329,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2562,8 +2352,8 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2579,9 +2369,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2599,9 +2388,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2619,8 +2407,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2635,9 +2423,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2651,9 +2438,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2667,9 +2453,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2684,9 +2469,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2701,8 +2485,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2720,8 +2504,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2739,8 +2523,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2761,8 +2545,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2790,8 +2574,8 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2807,9 +2591,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2827,9 +2610,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2847,8 +2629,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2863,9 +2645,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2879,9 +2660,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2895,9 +2675,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2912,9 +2691,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2929,8 +2707,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2948,8 +2726,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2967,8 +2745,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2989,8 +2767,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -3018,7 +2796,7 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3043,7 +2821,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3068,7 +2846,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3093,7 +2871,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3118,7 +2896,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3133,7 +2910,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3148,7 +2924,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3163,7 +2938,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3178,7 +2952,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3194,7 +2967,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3210,8 +2982,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3227,8 +2997,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3251,7 +3019,7 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3278,7 +3046,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3307,7 +3075,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3336,7 +3104,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3362,7 +3130,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3379,7 +3146,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3396,7 +3162,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3413,7 +3178,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3430,7 +3194,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3450,7 +3213,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3470,8 +3232,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3489,8 +3249,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3515,7 +3273,7 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3541,7 +3299,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3568,7 +3326,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3595,7 +3353,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3621,7 +3379,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3637,7 +3394,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3653,7 +3409,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3670,7 +3425,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3687,7 +3441,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3705,7 +3458,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3723,8 +3475,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3744,8 +3494,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3772,7 +3520,7 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3800,7 +3548,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3831,7 +3579,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3862,7 +3610,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3889,7 +3637,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3907,7 +3654,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3925,7 +3671,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3944,7 +3689,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3963,7 +3707,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3985,7 +3728,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4007,8 +3749,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4030,8 +3770,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4060,7 +3798,7 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4088,7 +3826,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4119,7 +3857,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4150,7 +3888,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4177,7 +3915,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4195,7 +3932,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4213,7 +3949,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4232,7 +3967,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4251,7 +3985,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4273,7 +4006,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4295,8 +4027,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4318,8 +4048,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4348,7 +4076,7 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4375,7 +4103,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4404,7 +4132,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4433,7 +4161,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4459,7 +4187,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4476,7 +4203,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4493,7 +4219,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4510,7 +4235,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4527,7 +4251,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4547,7 +4270,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4567,8 +4289,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4586,8 +4306,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4612,7 +4330,7 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4639,7 +4357,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4668,7 +4386,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4697,7 +4415,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4723,7 +4441,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4740,7 +4457,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4757,7 +4473,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4774,7 +4489,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4791,7 +4505,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4811,7 +4524,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4831,8 +4543,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4850,8 +4560,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4876,7 +4584,7 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4904,7 +4612,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4935,7 +4643,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4966,7 +4674,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4993,7 +4701,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5011,7 +4718,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5029,7 +4735,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5048,7 +4753,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5067,7 +4771,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5089,7 +4792,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5111,8 +4813,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5134,8 +4834,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5164,7 +4862,7 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5192,7 +4890,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5223,7 +4921,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5254,7 +4952,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5281,7 +4979,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5299,7 +4996,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5317,7 +5013,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5336,7 +5031,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5355,7 +5049,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5377,7 +5070,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5399,8 +5091,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5422,8 +5112,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5452,7 +5140,7 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5480,7 +5168,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5511,7 +5199,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5542,7 +5230,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5569,7 +5257,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5587,7 +5274,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5605,7 +5291,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5624,7 +5309,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5643,7 +5327,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5665,7 +5348,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5687,8 +5369,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5710,8 +5390,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5740,7 +5418,7 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5768,7 +5446,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5799,7 +5477,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5830,7 +5508,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5857,7 +5535,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5875,7 +5552,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5893,7 +5569,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5912,7 +5587,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5931,7 +5605,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5953,7 +5626,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5975,8 +5647,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5998,8 +5668,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6028,7 +5696,7 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6056,7 +5724,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6087,7 +5755,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6118,7 +5786,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6145,7 +5813,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6163,7 +5830,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6181,7 +5847,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6200,7 +5865,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6219,7 +5883,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6241,7 +5904,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6263,8 +5925,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6286,8 +5946,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6316,7 +5974,7 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6344,7 +6002,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6375,7 +6033,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6406,7 +6064,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6433,7 +6091,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6451,7 +6108,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6469,7 +6125,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6488,7 +6143,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6507,7 +6161,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6529,7 +6182,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6551,8 +6203,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6574,8 +6224,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6604,7 +6252,7 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6632,7 +6280,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6663,7 +6311,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6694,7 +6342,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6721,7 +6369,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6739,7 +6386,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6757,7 +6403,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6776,7 +6421,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6795,7 +6439,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6817,7 +6460,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6839,8 +6481,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6862,8 +6502,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6892,7 +6530,7 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6920,7 +6558,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6951,7 +6589,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6982,7 +6620,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -7009,7 +6647,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7027,7 +6664,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7045,7 +6681,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7064,7 +6699,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7083,7 +6717,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7105,7 +6738,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7127,8 +6759,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7150,8 +6780,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7180,6 +6808,7 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7208,6 +6837,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7236,6 +6866,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7264,6 +6895,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7292,7 +6924,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7310,7 +6941,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7328,7 +6958,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7346,7 +6975,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7364,7 +6992,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7384,7 +7011,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7404,8 +7030,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7425,8 +7049,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7455,6 +7077,7 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7484,6 +7107,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7514,6 +7138,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7544,6 +7169,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7572,7 +7198,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7591,7 +7216,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7610,7 +7234,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7629,7 +7252,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7648,7 +7270,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7670,7 +7291,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7692,8 +7312,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7714,8 +7332,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7745,6 +7361,7 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7774,6 +7391,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7804,6 +7422,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7834,6 +7453,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7863,7 +7483,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7882,7 +7501,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7901,7 +7519,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7921,7 +7538,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7941,7 +7557,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7963,7 +7578,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7985,8 +7599,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8010,8 +7622,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8044,6 +7654,7 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8074,6 +7685,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8106,6 +7718,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8138,6 +7751,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8167,7 +7781,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8187,7 +7800,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8207,7 +7819,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8228,7 +7839,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8249,7 +7859,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8273,7 +7882,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8297,8 +7905,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8325,8 +7931,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8362,6 +7966,7 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8392,6 +7997,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8424,6 +8030,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8456,6 +8063,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8485,7 +8093,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8505,7 +8112,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8525,7 +8131,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8546,7 +8151,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8567,7 +8171,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8591,7 +8194,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8615,8 +8217,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8643,8 +8243,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8680,6 +8278,7 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8709,6 +8308,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8739,6 +8339,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8769,6 +8370,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8797,7 +8399,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8816,7 +8417,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8835,7 +8435,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8854,7 +8453,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8873,7 +8471,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8895,7 +8492,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8917,8 +8513,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8941,8 +8535,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8974,6 +8566,7 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9003,6 +8596,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9033,6 +8627,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9063,6 +8658,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9091,7 +8687,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9110,7 +8705,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9129,7 +8723,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9148,7 +8741,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9167,7 +8759,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9189,7 +8780,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9211,8 +8801,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9233,8 +8821,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9264,6 +8850,7 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9294,6 +8881,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9326,6 +8914,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9358,6 +8947,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9387,7 +8977,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9407,7 +8996,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9427,7 +9015,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9448,7 +9035,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9469,7 +9055,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9493,7 +9078,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9517,8 +9101,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9545,8 +9127,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9582,6 +9162,7 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9612,6 +9193,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9644,6 +9226,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9676,6 +9259,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9705,7 +9289,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9725,7 +9308,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9745,7 +9327,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9766,7 +9347,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9787,7 +9367,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9811,7 +9390,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9835,8 +9413,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9863,8 +9439,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9900,6 +9474,7 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9930,6 +9505,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9962,6 +9538,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9994,6 +9571,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10023,7 +9601,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10043,7 +9620,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10063,7 +9639,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10084,7 +9659,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10105,7 +9679,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10129,7 +9702,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10153,8 +9725,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10181,8 +9751,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10218,6 +9786,7 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10248,6 +9817,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10280,6 +9850,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10312,6 +9883,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10341,7 +9913,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10361,7 +9932,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10381,7 +9951,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10402,7 +9971,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10423,7 +9991,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10447,7 +10014,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10471,8 +10037,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10499,8 +10063,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10536,6 +10098,7 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10566,6 +10129,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10598,6 +10162,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10630,6 +10195,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10659,7 +10225,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10679,7 +10244,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10699,7 +10263,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10720,7 +10283,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10741,7 +10303,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10765,7 +10326,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10789,8 +10349,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10815,8 +10373,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10850,6 +10406,7 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10880,6 +10437,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10912,6 +10470,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10944,6 +10503,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10973,7 +10533,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10993,7 +10552,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11013,7 +10571,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11034,7 +10591,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11055,7 +10611,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11079,7 +10634,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11103,8 +10657,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11131,8 +10683,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11168,6 +10718,7 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11198,6 +10749,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11230,6 +10782,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11262,6 +10815,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -11291,7 +10845,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11311,7 +10864,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11331,7 +10883,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11352,7 +10903,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11373,7 +10923,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11397,7 +10946,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11421,8 +10969,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11449,8 +10995,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11486,6 +11030,7 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11516,6 +11061,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11548,6 +11094,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11580,6 +11127,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -11609,7 +11157,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11629,7 +11176,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11649,7 +11195,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11670,7 +11215,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11691,7 +11235,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11715,7 +11258,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11739,8 +11281,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11767,8 +11307,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11804,9 +11342,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX7-LABEL: flat_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11819,10 +11356,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11835,10 +11370,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11851,9 +11384,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11866,10 +11398,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -11880,10 +11410,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -11894,10 +11422,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -11908,10 +11434,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -11922,9 +11446,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11937,9 +11460,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX11-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11952,10 +11474,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11968,10 +11488,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11991,9 +11509,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX7-LABEL: flat_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12006,10 +11523,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12022,10 +11537,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12038,9 +11551,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12053,10 +11565,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12067,10 +11577,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12081,10 +11589,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -12095,10 +11601,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -12109,9 +11613,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12124,9 +11627,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12139,10 +11641,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12155,10 +11655,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12178,9 +11676,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX7-LABEL: flat_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12195,10 +11692,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12214,10 +11709,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12233,9 +11726,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12249,10 +11741,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12265,10 +11755,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12280,10 +11768,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -12296,10 +11782,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 @@ -12311,9 +11795,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12329,9 +11812,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12347,10 +11829,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12365,10 +11845,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12390,9 +11868,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX7-LABEL: flat_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12408,10 +11885,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12429,10 +11904,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12450,9 +11923,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12467,10 +11939,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12484,10 +11954,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12500,10 +11968,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12517,10 +11983,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12533,9 +11997,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12553,9 +12016,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12573,10 +12035,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12597,10 +12057,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12628,8 +12086,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX7-LABEL: flat_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12639,9 +12097,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12651,9 +12108,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12663,8 +12119,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12674,9 +12130,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12685,9 +12140,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12696,9 +12150,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12707,9 +12160,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12718,8 +12170,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX11-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12729,8 +12181,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX11-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12740,8 +12192,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX12-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12751,8 +12203,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX12-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12768,8 +12220,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX7-LABEL: flat_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12779,9 +12231,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12791,9 +12242,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12803,8 +12253,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12814,9 +12264,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12825,9 +12274,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12836,9 +12284,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12847,9 +12294,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12858,8 +12304,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12869,8 +12315,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12880,8 +12326,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12891,8 +12337,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12908,8 +12354,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX7-LABEL: flat_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12920,9 +12366,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12934,9 +12379,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12948,8 +12392,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12960,9 +12404,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12972,9 +12415,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12984,9 +12426,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12997,9 +12438,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13010,8 +12450,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13023,8 +12463,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13036,8 +12476,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13051,8 +12491,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13072,8 +12512,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX7-LABEL: flat_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13084,9 +12524,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13098,9 +12537,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13112,8 +12550,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13124,9 +12562,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13136,9 +12573,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13148,9 +12584,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13161,9 +12596,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13174,8 +12608,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13187,8 +12621,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13200,8 +12634,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13215,8 +12649,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13236,9 +12670,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13248,10 +12681,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13261,10 +12692,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13274,9 +12703,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13286,10 +12714,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13298,10 +12724,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13310,10 +12734,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13322,10 +12744,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13334,9 +12754,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13346,9 +12765,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13358,10 +12776,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13371,10 +12787,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13390,9 +12804,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13404,10 +12817,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13420,10 +12831,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13436,9 +12845,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13449,10 +12857,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13463,10 +12869,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13477,10 +12881,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13491,10 +12893,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13505,9 +12905,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13520,9 +12919,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13535,10 +12933,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13550,10 +12946,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13571,9 +12965,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13584,10 +12977,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13599,10 +12990,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13614,9 +13003,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13627,10 +13015,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13640,10 +13026,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13653,10 +13037,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13667,10 +13049,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13681,9 +13061,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13695,9 +13074,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13709,10 +13087,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13726,10 +13102,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13749,9 +13123,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13764,10 +13137,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13782,10 +13153,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13800,9 +13169,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13814,10 +13182,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13829,10 +13195,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13844,10 +13208,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13860,10 +13222,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13876,9 +13236,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13893,9 +13252,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13910,10 +13268,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13929,10 +13285,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13954,9 +13308,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13969,10 +13322,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13987,10 +13338,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -14005,9 +13354,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -14019,10 +13367,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -14034,10 +13380,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -14049,10 +13393,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -14065,10 +13407,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -14081,9 +13421,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -14098,9 +13437,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -14115,10 +13453,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -14134,10 +13470,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -14159,8 +13493,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14176,9 +13510,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -14195,9 +13528,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -14214,8 +13546,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -14230,9 +13562,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14246,9 +13577,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14261,9 +13591,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14277,9 +13606,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14292,8 +13620,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14310,8 +13638,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14328,8 +13656,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14345,8 +13673,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14369,8 +13697,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14387,9 +13715,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -14408,9 +13735,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -14429,8 +13755,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -14446,9 +13772,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14463,9 +13788,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14479,9 +13803,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14497,9 +13820,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14514,8 +13836,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14534,8 +13856,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14554,8 +13876,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14577,8 +13899,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14607,8 +13929,8 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14625,9 +13947,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -14646,9 +13967,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -14667,8 +13987,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -14684,9 +14004,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14701,9 +14020,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14717,9 +14035,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14735,9 +14052,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14752,8 +14068,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14772,8 +14088,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14792,8 +14108,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14815,8 +14131,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14845,7 +14161,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14870,7 +14186,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14895,7 +14211,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14920,7 +14236,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14945,7 +14261,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14960,7 +14275,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14975,7 +14289,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14990,7 +14303,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15005,7 +14317,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15021,7 +14332,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15037,8 +14347,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15054,8 +14362,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15078,7 +14384,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15105,7 +14411,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15133,7 +14439,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15161,7 +14467,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15187,7 +14493,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15204,7 +14509,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15221,7 +14525,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15238,7 +14541,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15255,7 +14557,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15274,7 +14575,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15293,8 +14593,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15312,8 +14610,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15338,7 +14634,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15364,7 +14660,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15391,7 +14687,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15418,7 +14714,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15444,7 +14740,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15460,7 +14755,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15476,7 +14770,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15493,7 +14786,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15510,7 +14802,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15528,7 +14819,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15546,8 +14836,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15567,8 +14855,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15595,7 +14881,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15623,7 +14909,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15653,7 +14939,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15683,7 +14969,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15710,7 +14996,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15728,7 +15013,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15746,7 +15030,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15765,7 +15048,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15784,7 +15066,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15805,7 +15086,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15826,8 +15106,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15849,8 +15127,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15879,7 +15155,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15907,7 +15183,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15937,7 +15213,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15967,7 +15243,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15994,7 +15270,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16012,7 +15287,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16030,7 +15304,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16049,7 +15322,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16068,7 +15340,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16089,7 +15360,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16110,8 +15380,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16133,8 +15401,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16163,7 +15429,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16190,7 +15456,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16218,7 +15484,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16246,7 +15512,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16272,7 +15538,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16289,7 +15554,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16306,7 +15570,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16323,7 +15586,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16340,7 +15602,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16359,7 +15620,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16378,8 +15638,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16397,8 +15655,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16423,7 +15679,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16450,7 +15706,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16478,7 +15734,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16506,7 +15762,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16532,7 +15788,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16549,7 +15804,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16566,7 +15820,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16583,7 +15836,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16600,7 +15852,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16619,7 +15870,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16638,8 +15888,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16657,8 +15905,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16683,7 +15929,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16711,7 +15957,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16741,7 +15987,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16771,7 +16017,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16798,7 +16044,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16816,7 +16061,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16834,7 +16078,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16853,7 +16096,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16872,7 +16114,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16893,7 +16134,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16914,8 +16154,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16937,8 +16175,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16967,7 +16203,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16995,7 +16231,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17025,7 +16261,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17055,7 +16291,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17082,7 +16318,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17100,7 +16335,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17118,7 +16352,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17137,7 +16370,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17156,7 +16388,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17177,7 +16408,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17198,8 +16428,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17221,8 +16449,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17251,7 +16477,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17279,7 +16505,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17309,7 +16535,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17339,7 +16565,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17366,7 +16592,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17384,7 +16609,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17402,7 +16626,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17421,7 +16644,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17440,7 +16662,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17461,7 +16682,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17482,8 +16702,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17505,8 +16723,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17535,7 +16751,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17563,7 +16779,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17593,7 +16809,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17623,7 +16839,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17650,7 +16866,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17668,7 +16883,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17686,7 +16900,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17705,7 +16918,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17724,7 +16936,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17745,7 +16956,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17766,8 +16976,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17789,8 +16997,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17819,7 +17025,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17847,7 +17053,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17877,7 +17083,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17907,7 +17113,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17934,7 +17140,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17952,7 +17157,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17970,7 +17174,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17989,7 +17192,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18008,7 +17210,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18029,7 +17230,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18050,8 +17250,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18073,8 +17271,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18103,7 +17299,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18131,7 +17327,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18161,7 +17357,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18191,7 +17387,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -18218,7 +17414,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18236,7 +17431,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18254,7 +17448,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18273,7 +17466,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18292,7 +17484,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18313,7 +17504,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18334,8 +17524,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18357,8 +17545,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18387,7 +17573,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18415,7 +17601,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18445,7 +17631,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18475,7 +17661,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -18502,7 +17688,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18520,7 +17705,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18538,7 +17722,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18557,7 +17740,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18576,7 +17758,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18597,7 +17778,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18618,8 +17798,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18641,8 +17819,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18671,7 +17847,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18699,7 +17875,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18729,7 +17905,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18759,7 +17935,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -18786,7 +17962,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18804,7 +17979,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18822,7 +17996,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18841,7 +18014,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18860,7 +18032,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18881,7 +18052,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18902,8 +18072,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18925,8 +18093,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18955,6 +18121,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18983,6 +18150,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19011,6 +18179,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19039,6 +18208,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19067,7 +18237,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19085,7 +18254,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19103,7 +18271,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19121,7 +18288,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19139,7 +18305,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19159,7 +18324,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19179,8 +18343,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19200,8 +18362,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19230,6 +18390,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19260,6 +18421,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19291,6 +18453,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19322,6 +18485,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19351,7 +18515,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19371,7 +18534,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19390,7 +18552,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19410,7 +18571,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19429,7 +18589,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19452,7 +18611,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19475,8 +18633,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19498,8 +18654,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19530,6 +18684,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19559,6 +18714,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19589,6 +18745,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19619,6 +18776,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19648,7 +18806,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19667,7 +18824,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19686,7 +18842,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19706,7 +18861,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19726,7 +18880,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19748,7 +18901,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19770,8 +18922,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19795,8 +18945,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19829,6 +18977,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19860,6 +19009,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19893,6 +19043,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19926,6 +19077,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19956,7 +19108,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19977,7 +19128,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19997,7 +19147,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20019,7 +19168,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20040,7 +19188,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20065,7 +19212,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20090,8 +19236,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20119,8 +19263,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20157,6 +19299,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20188,6 +19331,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20221,6 +19365,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20254,6 +19399,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20284,7 +19430,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20305,7 +19450,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20325,7 +19469,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20347,7 +19490,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20368,7 +19510,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20393,7 +19534,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20418,8 +19558,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20447,8 +19585,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20485,6 +19621,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20515,6 +19652,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20546,6 +19684,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20577,6 +19716,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20606,7 +19746,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20626,7 +19765,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20645,7 +19783,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20665,7 +19802,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20684,7 +19820,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20707,7 +19842,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20730,8 +19864,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20755,8 +19887,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20789,6 +19919,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20819,6 +19950,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20850,6 +19982,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20881,6 +20014,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20910,7 +20044,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20930,7 +20063,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20949,7 +20081,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20969,7 +20100,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20988,7 +20118,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21011,7 +20140,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21034,8 +20162,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21057,8 +20183,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21089,6 +20213,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21120,6 +20245,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21153,6 +20279,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21186,6 +20313,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21216,7 +20344,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21237,7 +20364,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21257,7 +20383,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21279,7 +20404,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21300,7 +20424,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21325,7 +20448,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21350,8 +20472,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21379,8 +20499,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21417,6 +20535,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21448,6 +20567,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21481,6 +20601,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21514,6 +20635,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21544,7 +20666,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21565,7 +20686,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21585,7 +20705,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21607,7 +20726,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21628,7 +20746,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21653,7 +20770,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21678,8 +20794,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21707,8 +20821,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21745,6 +20857,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21776,6 +20889,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21809,6 +20923,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21842,6 +20957,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21872,7 +20988,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21893,7 +21008,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21913,7 +21027,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21935,7 +21048,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21956,7 +21068,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21981,7 +21092,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22006,8 +21116,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22035,8 +21143,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22073,6 +21179,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22104,6 +21211,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22137,6 +21245,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22170,6 +21279,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22200,7 +21310,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22221,7 +21330,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22241,7 +21349,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22263,7 +21370,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22284,7 +21390,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22309,7 +21414,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22334,8 +21438,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22363,8 +21465,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22401,6 +21501,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22432,6 +21533,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22465,6 +21567,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22498,6 +21601,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22528,7 +21632,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22549,7 +21652,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22569,7 +21671,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22591,7 +21692,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22612,7 +21712,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22637,7 +21736,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22662,8 +21760,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22689,8 +21785,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22725,6 +21819,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22756,6 +21851,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22789,6 +21885,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22822,6 +21919,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22852,7 +21950,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22873,7 +21970,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22893,7 +21989,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22915,7 +22010,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22936,7 +22030,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22961,7 +22054,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22986,8 +22078,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23015,8 +22105,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23053,6 +22141,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23084,6 +22173,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23117,6 +22207,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23150,6 +22241,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -23180,7 +22272,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23201,7 +22292,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23221,7 +22311,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23243,7 +22332,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23264,7 +22352,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23289,7 +22376,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23314,8 +22400,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23343,8 +22427,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23381,6 +22463,7 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23412,6 +22495,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23445,6 +22529,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23478,6 +22563,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -23508,7 +22594,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23529,7 +22614,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23549,7 +22633,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23571,7 +22654,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23592,7 +22674,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23617,7 +22698,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23642,8 +22722,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23671,8 +22749,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index 5c59481c598539..ab485b17994704 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -5,10 +5,8 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v1, s3 @@ -27,8 +25,10 @@ entry: define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_and_b32_e64 v0, v0, s2 @@ -69,10 +69,8 @@ entry: define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v1, s3 @@ -94,10 +92,8 @@ entry: define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index b2340caa2933f9..ddc4673a290fed 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -15,9 +15,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -30,10 +29,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -46,10 +43,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -62,9 +57,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -77,10 +71,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc @@ -91,10 +83,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc slc @@ -105,10 +95,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt @@ -119,10 +107,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt @@ -133,9 +119,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX11-WGP-LABEL: flat_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -148,9 +133,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX11-CU-LABEL: flat_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -163,10 +147,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -179,10 +161,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -202,8 +182,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -230,8 +211,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_nop 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX10-WGP-NEXT: s_mov_b32 s6, 0 @@ -257,8 +240,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_nop 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX10-CU-NEXT: s_mov_b32 s6, 0 @@ -284,8 +269,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 @@ -312,8 +298,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 @@ -341,8 +329,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_nop 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 @@ -370,10 +360,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0x3ff ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 @@ -393,10 +381,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s4, 0x3ff ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s4, 2 @@ -416,8 +402,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX11-WGP-LABEL: flat_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 @@ -445,8 +432,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX11-CU-LABEL: flat_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 @@ -474,8 +462,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe +; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 @@ -508,8 +498,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-CU-NEXT: s_wait_alu 0xfffe +; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 @@ -551,9 +543,8 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -566,10 +557,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -582,10 +571,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -598,9 +585,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -613,10 +599,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -627,10 +611,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -641,10 +623,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -655,10 +635,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -669,9 +647,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX11-WGP-LABEL: flat_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -684,9 +661,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX11-CU-LABEL: flat_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -699,10 +675,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -715,10 +689,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -738,8 +710,8 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 @@ -766,9 +738,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 @@ -794,9 +765,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 @@ -822,8 +792,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 @@ -850,9 +820,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] @@ -880,9 +849,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] @@ -910,10 +878,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] @@ -933,10 +899,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] @@ -956,8 +920,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -985,8 +949,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1014,8 +978,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX12-WGP-LABEL: flat_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1048,8 +1012,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX12-CU-LABEL: flat_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1091,9 +1055,8 @@ entry: define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX7-LABEL: flat_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1107,10 +1070,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1124,10 +1085,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: flat_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1141,9 +1100,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1157,10 +1115,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -1172,10 +1128,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -1186,10 +1140,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -1201,10 +1153,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -1215,9 +1165,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1231,9 +1180,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX11-CU-LABEL: flat_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1247,10 +1195,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1266,10 +1212,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX12-CU-LABEL: flat_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 304c80d7bb24d4..4fa15c194adf6a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -15,9 +15,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -30,10 +29,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -46,10 +43,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -62,9 +57,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -77,10 +71,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -91,10 +83,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -105,10 +95,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -119,10 +107,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -133,9 +119,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX11-WGP-LABEL: flat_singlethread_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -148,9 +133,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX11-CU-LABEL: flat_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -163,10 +147,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX12-WGP-LABEL: flat_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -179,10 +161,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX12-CU-LABEL: flat_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -202,9 +182,8 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX7-LABEL: flat_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -217,10 +196,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -233,10 +210,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -249,9 +224,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -264,10 +238,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -278,10 +250,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -292,10 +262,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -306,10 +274,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -320,9 +286,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -335,9 +300,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -350,10 +314,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -366,10 +328,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -389,9 +349,8 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-LABEL: flat_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -404,10 +363,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -420,10 +377,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -436,9 +391,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -451,10 +405,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -465,10 +417,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -479,10 +429,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -493,10 +441,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -507,9 +453,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -522,9 +467,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -537,10 +481,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -553,10 +495,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -576,9 +516,8 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-LABEL: flat_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -591,10 +530,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -607,10 +544,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -623,9 +558,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -638,10 +572,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -652,10 +584,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -666,10 +596,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -680,10 +608,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -694,9 +620,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -709,9 +634,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -724,10 +648,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -740,10 +662,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -763,8 +683,8 @@ entry: define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX7-LABEL: flat_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -774,9 +694,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -786,9 +705,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -798,8 +716,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -809,9 +727,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -820,9 +737,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -831,9 +747,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -842,9 +757,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -853,8 +767,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX11-WGP-LABEL: flat_singlethread_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -864,8 +778,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX11-CU-LABEL: flat_singlethread_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -875,8 +789,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX12-WGP-LABEL: flat_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -886,8 +800,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX12-CU-LABEL: flat_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -903,8 +817,8 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX7-LABEL: flat_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -914,9 +828,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -926,9 +839,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -938,8 +850,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -949,9 +861,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -960,9 +871,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -971,9 +881,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -982,9 +891,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -993,8 +901,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1004,8 +912,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1015,8 +923,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1026,8 +934,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1043,8 +951,8 @@ entry: define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-LABEL: flat_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1054,9 +962,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1066,9 +973,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1078,8 +984,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1089,9 +995,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1100,9 +1005,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1111,9 +1015,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1122,9 +1025,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1133,8 +1035,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX11-WGP-LABEL: flat_singlethread_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1144,8 +1046,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX11-CU-LABEL: flat_singlethread_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1155,8 +1057,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX12-WGP-LABEL: flat_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1166,8 +1068,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX12-CU-LABEL: flat_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1183,8 +1085,8 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-LABEL: flat_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1194,9 +1096,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1206,9 +1107,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1218,8 +1118,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1229,9 +1129,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1240,9 +1139,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1251,9 +1149,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1262,9 +1159,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1273,8 +1169,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1284,8 +1180,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1295,8 +1191,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1306,8 +1202,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1323,9 +1219,8 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1335,10 +1230,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1348,10 +1241,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1361,9 +1252,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1373,10 +1263,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1385,10 +1273,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1397,10 +1283,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1409,10 +1293,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1421,9 +1303,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1433,9 +1314,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1445,10 +1325,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1458,10 +1336,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1477,9 +1353,8 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1489,10 +1364,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1502,10 +1375,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1515,9 +1386,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1527,10 +1397,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1539,10 +1407,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1551,10 +1417,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1563,10 +1427,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1575,9 +1437,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1587,9 +1448,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1599,10 +1459,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1612,10 +1470,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1631,9 +1487,8 @@ entry: define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1643,10 +1498,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1656,10 +1509,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1669,9 +1520,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1681,10 +1531,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1693,10 +1541,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1705,10 +1551,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1717,10 +1561,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1729,9 +1571,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1741,9 +1582,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1753,10 +1593,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1766,10 +1604,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1785,9 +1621,8 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1797,10 +1632,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1810,10 +1643,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1823,9 +1654,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1835,10 +1665,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1847,10 +1675,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1859,10 +1685,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1871,10 +1695,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1883,9 +1705,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1895,9 +1716,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1907,10 +1727,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1920,10 +1738,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1939,9 +1755,8 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1951,10 +1766,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1964,10 +1777,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1977,9 +1788,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1989,10 +1799,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2001,10 +1809,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2013,10 +1819,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2025,10 +1829,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2037,9 +1839,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2049,9 +1850,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2061,10 +1861,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2074,10 +1872,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2093,8 +1889,8 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2108,9 +1904,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2124,9 +1919,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2140,8 +1934,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2155,9 +1949,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2169,9 +1962,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2183,9 +1975,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2197,9 +1988,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2211,8 +2001,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2226,8 +2016,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2241,8 +2031,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2256,8 +2046,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2278,8 +2068,8 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2293,9 +2083,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2309,9 +2098,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2325,8 +2113,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2340,9 +2128,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2354,9 +2141,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2368,9 +2154,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2382,9 +2167,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2396,8 +2180,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2411,8 +2195,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2426,8 +2210,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2441,8 +2225,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2463,8 +2247,8 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2478,9 +2262,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2494,9 +2277,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2510,8 +2292,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2525,9 +2307,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2539,9 +2320,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2553,9 +2333,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2567,9 +2346,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2581,8 +2359,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2596,8 +2374,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2611,8 +2389,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2626,8 +2404,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2648,7 +2426,7 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -2673,7 +2451,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2698,7 +2476,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2723,7 +2501,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -2748,7 +2526,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2763,7 +2540,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2778,7 +2554,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -2793,7 +2568,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -2808,7 +2582,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2824,7 +2597,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2840,8 +2612,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2857,8 +2627,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2881,7 +2649,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -2906,7 +2674,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2931,7 +2699,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2956,7 +2724,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -2981,7 +2749,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2996,7 +2763,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3011,7 +2777,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3026,7 +2791,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3041,7 +2805,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3057,7 +2820,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3073,8 +2835,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3090,8 +2850,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3114,7 +2872,7 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3139,7 +2897,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3164,7 +2922,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3189,7 +2947,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3214,7 +2972,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3229,7 +2986,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3244,7 +3000,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3259,7 +3014,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3274,7 +3028,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3290,7 +3043,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3306,8 +3058,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3323,8 +3073,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3347,7 +3095,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3372,7 +3120,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3397,7 +3145,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3422,7 +3170,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3447,7 +3195,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3462,7 +3209,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3477,7 +3223,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3492,7 +3237,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3507,7 +3251,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3523,7 +3266,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3539,8 +3281,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3556,8 +3296,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3580,7 +3318,7 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3605,7 +3343,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3630,7 +3368,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3655,7 +3393,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3680,7 +3418,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3695,7 +3432,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3710,7 +3446,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3725,7 +3460,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3740,7 +3474,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3756,7 +3489,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3772,8 +3504,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3789,8 +3519,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3813,7 +3541,7 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3838,7 +3566,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3863,7 +3591,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3888,7 +3616,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3913,7 +3641,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3928,7 +3655,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3943,7 +3669,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3958,7 +3683,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3973,7 +3697,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3989,7 +3712,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4005,8 +3727,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4022,8 +3742,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4046,7 +3764,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4071,7 +3789,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4096,7 +3814,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4121,7 +3839,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4146,7 +3864,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4161,7 +3878,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4176,7 +3892,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4191,7 +3906,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4206,7 +3920,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4222,7 +3935,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4238,8 +3950,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4255,8 +3965,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4279,7 +3987,7 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4304,7 +4012,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4329,7 +4037,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4354,7 +4062,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4379,7 +4087,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4394,7 +4101,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4409,7 +4115,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4424,7 +4129,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4439,7 +4143,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4455,7 +4158,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4471,8 +4173,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4488,8 +4188,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4512,7 +4210,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4537,7 +4235,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4562,7 +4260,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4587,7 +4285,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4612,7 +4310,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4627,7 +4324,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4642,7 +4338,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4657,7 +4352,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4672,7 +4366,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4688,7 +4381,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4704,8 +4396,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4721,8 +4411,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4745,7 +4433,7 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4770,7 +4458,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4795,7 +4483,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4820,7 +4508,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4845,7 +4533,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4860,7 +4547,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4875,7 +4561,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4890,7 +4575,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4905,7 +4589,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4921,7 +4604,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4937,8 +4619,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4954,8 +4634,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4978,7 +4656,7 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5003,7 +4681,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5028,7 +4706,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5053,7 +4731,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5078,7 +4756,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5093,7 +4770,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5108,7 +4784,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5123,7 +4798,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5138,7 +4812,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5154,7 +4827,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5170,8 +4842,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5187,8 +4857,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5211,7 +4879,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5236,7 +4904,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5261,7 +4929,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5286,7 +4954,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5311,7 +4979,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5326,7 +4993,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5341,7 +5007,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5356,7 +5021,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5371,7 +5035,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5387,7 +5050,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5403,8 +5065,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5420,8 +5080,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5444,7 +5102,7 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5469,7 +5127,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5494,7 +5152,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5519,7 +5177,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5544,7 +5202,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5559,7 +5216,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5574,7 +5230,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5589,7 +5244,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5604,7 +5258,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5620,7 +5273,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5636,8 +5288,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5653,8 +5303,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5677,7 +5325,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5702,7 +5350,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5727,7 +5375,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5752,7 +5400,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5777,7 +5425,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5792,7 +5439,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5807,7 +5453,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5822,7 +5467,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5837,7 +5481,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5853,7 +5496,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5869,8 +5511,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5886,8 +5526,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5910,7 +5548,7 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5935,7 +5573,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5960,7 +5598,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5985,7 +5623,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6010,7 +5648,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6025,7 +5662,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6040,7 +5676,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6055,7 +5690,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6070,7 +5704,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6086,7 +5719,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6102,8 +5734,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6119,8 +5749,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6143,6 +5771,7 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6171,6 +5800,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6199,6 +5829,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6227,6 +5858,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6255,7 +5887,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6273,7 +5904,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6291,7 +5921,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6309,7 +5938,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6327,7 +5955,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6347,7 +5974,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6367,8 +5993,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6388,8 +6012,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6418,6 +6040,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6446,6 +6069,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6474,6 +6098,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6502,6 +6127,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6530,7 +6156,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6548,7 +6173,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6566,7 +6190,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6584,7 +6207,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6602,7 +6224,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6622,7 +6243,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6642,8 +6262,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6663,8 +6281,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6693,6 +6309,7 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6721,6 +6338,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6749,6 +6367,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6777,6 +6396,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6805,7 +6425,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6823,7 +6442,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6841,7 +6459,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6859,7 +6476,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6877,7 +6493,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6897,7 +6512,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6917,8 +6531,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6938,8 +6550,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6968,6 +6578,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6996,6 +6607,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7024,6 +6636,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7052,6 +6665,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7080,7 +6694,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7098,7 +6711,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7116,7 +6728,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7134,7 +6745,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7152,7 +6762,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7172,7 +6781,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7192,8 +6800,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7213,8 +6819,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7243,6 +6847,7 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7271,6 +6876,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7299,6 +6905,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7327,6 +6934,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7355,7 +6963,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7373,7 +6980,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7391,7 +6997,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7409,7 +7014,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7427,7 +7031,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7447,7 +7050,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7467,8 +7069,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7488,8 +7088,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7518,6 +7116,7 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7546,6 +7145,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7574,6 +7174,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7602,6 +7203,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7630,7 +7232,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7648,7 +7249,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7666,7 +7266,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7684,7 +7283,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7702,7 +7300,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7722,7 +7319,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7742,8 +7338,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7763,8 +7357,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7793,6 +7385,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7821,6 +7414,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7849,6 +7443,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7877,6 +7472,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7905,7 +7501,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7923,7 +7518,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7941,7 +7535,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7959,7 +7552,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7977,7 +7569,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7997,7 +7588,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8017,8 +7607,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8038,8 +7626,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8068,6 +7654,7 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8096,6 +7683,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8124,6 +7712,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8152,6 +7741,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8180,7 +7770,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8198,7 +7787,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8216,7 +7804,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8234,7 +7821,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8252,7 +7838,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8272,7 +7857,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8292,8 +7876,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8313,8 +7895,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8343,6 +7923,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8371,6 +7952,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8399,6 +7981,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8427,6 +8010,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8455,7 +8039,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8473,7 +8056,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8491,7 +8073,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8509,7 +8090,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8527,7 +8107,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8547,7 +8126,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8567,8 +8145,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8588,8 +8164,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8618,6 +8192,7 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8646,6 +8221,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8674,6 +8250,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8702,6 +8279,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8730,7 +8308,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8748,7 +8325,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8766,7 +8342,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8784,7 +8359,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8802,7 +8376,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8822,7 +8395,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8842,8 +8414,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8863,8 +8433,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8893,6 +8461,7 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8921,6 +8490,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8949,6 +8519,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8977,6 +8548,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9005,7 +8577,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9023,7 +8594,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9041,7 +8611,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9059,7 +8628,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9077,7 +8645,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9097,7 +8664,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9117,8 +8683,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9138,8 +8702,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9168,6 +8730,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9196,6 +8759,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9224,6 +8788,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9252,6 +8817,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9280,7 +8846,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9298,7 +8863,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9316,7 +8880,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9334,7 +8897,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9352,7 +8914,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9372,7 +8933,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9392,8 +8952,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9413,8 +8971,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9443,6 +8999,7 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9471,6 +9028,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9499,6 +9057,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9527,6 +9086,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9555,7 +9115,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9573,7 +9132,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9591,7 +9149,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9609,7 +9166,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9627,7 +9183,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9647,7 +9202,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9667,8 +9221,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9688,8 +9240,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9718,6 +9268,7 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9746,6 +9297,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9774,6 +9326,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9802,6 +9355,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9830,7 +9384,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9848,7 +9401,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9866,7 +9418,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9884,7 +9435,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9902,7 +9452,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9922,7 +9471,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9942,8 +9490,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9963,8 +9509,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9993,6 +9537,7 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10021,6 +9566,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10049,6 +9595,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10077,6 +9624,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10105,7 +9653,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10123,7 +9670,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10141,7 +9687,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10159,7 +9704,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10177,7 +9721,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10197,7 +9740,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10217,8 +9759,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10238,8 +9778,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10268,9 +9806,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX7-LABEL: flat_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10283,10 +9820,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10299,10 +9834,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10315,9 +9848,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10330,10 +9862,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10344,10 +9874,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10358,10 +9886,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10372,10 +9898,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10386,9 +9910,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10401,9 +9924,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10416,10 +9938,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10432,10 +9952,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10455,9 +9973,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10470,10 +9987,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10486,10 +10001,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10502,9 +10015,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10517,10 +10029,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10531,10 +10041,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10545,10 +10053,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10559,10 +10065,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10573,9 +10077,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10588,9 +10091,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10603,10 +10105,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10619,10 +10119,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10642,9 +10140,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-LABEL: flat_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10657,10 +10154,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10673,10 +10168,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10689,9 +10182,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10704,10 +10196,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10718,10 +10208,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10732,10 +10220,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10746,10 +10232,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10760,9 +10244,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10775,9 +10258,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10790,10 +10272,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10806,10 +10286,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10829,9 +10307,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10844,10 +10321,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10860,10 +10335,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10876,9 +10349,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10891,10 +10363,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10905,10 +10375,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10919,10 +10387,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10933,10 +10399,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10947,9 +10411,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10962,9 +10425,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10977,10 +10439,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10993,10 +10453,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11016,8 +10474,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX7-LABEL: flat_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11027,9 +10485,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11039,9 +10496,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11051,8 +10507,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11062,9 +10518,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11073,9 +10528,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11084,9 +10538,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11095,9 +10548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11106,8 +10558,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11117,8 +10569,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11128,8 +10580,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11139,8 +10591,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11156,8 +10608,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11167,9 +10619,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11179,9 +10630,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11191,8 +10641,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11202,9 +10652,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11213,9 +10662,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11224,9 +10672,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11235,9 +10682,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11246,8 +10692,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11257,8 +10703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11268,8 +10714,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11279,8 +10725,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11296,8 +10742,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-LABEL: flat_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11307,9 +10753,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11319,9 +10764,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11331,8 +10775,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11342,9 +10786,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11353,9 +10796,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11364,9 +10806,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11375,9 +10816,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11386,8 +10826,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11397,8 +10837,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11408,8 +10848,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11419,8 +10859,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11436,8 +10876,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11447,9 +10887,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11459,9 +10898,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11471,8 +10909,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11482,9 +10920,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11493,9 +10930,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11504,9 +10940,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11515,9 +10950,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11526,8 +10960,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11537,8 +10971,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11548,8 +10982,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11559,8 +10993,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11576,9 +11010,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11588,10 +11021,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11601,10 +11032,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11614,9 +11043,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11626,10 +11054,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11638,10 +11064,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11650,10 +11074,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11662,10 +11084,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11674,9 +11094,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11686,9 +11105,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11698,10 +11116,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11711,10 +11127,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11730,9 +11144,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11742,10 +11155,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11755,10 +11166,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11768,9 +11177,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11780,10 +11188,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11792,10 +11198,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11804,10 +11208,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11816,10 +11218,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11828,9 +11228,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11840,9 +11239,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11852,10 +11250,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11865,10 +11261,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11884,9 +11278,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11896,10 +11289,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11909,10 +11300,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11922,9 +11311,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11934,10 +11322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11946,10 +11332,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11958,10 +11342,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11970,10 +11352,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11982,9 +11362,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11994,9 +11373,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12006,10 +11384,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12019,10 +11395,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12038,9 +11412,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12050,10 +11423,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12063,10 +11434,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12076,9 +11445,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12088,10 +11456,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12100,10 +11466,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12112,10 +11476,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12124,10 +11486,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12136,9 +11496,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12148,9 +11507,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12160,10 +11518,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12173,10 +11529,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12192,9 +11546,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12204,10 +11557,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12217,10 +11568,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12230,9 +11579,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12242,10 +11590,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12254,10 +11600,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12266,10 +11610,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12278,10 +11620,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12290,9 +11630,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12302,9 +11641,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12314,10 +11652,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12327,10 +11663,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12346,8 +11680,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12361,9 +11695,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12377,9 +11710,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12393,8 +11725,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12408,9 +11740,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12422,9 +11753,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12436,9 +11766,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12450,9 +11779,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12464,8 +11792,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12479,8 +11807,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12494,8 +11822,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12509,8 +11837,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12531,8 +11859,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12546,9 +11874,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12562,9 +11889,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12578,8 +11904,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12593,9 +11919,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12607,9 +11932,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12621,9 +11945,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12635,9 +11958,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12649,8 +11971,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12664,8 +11986,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12679,8 +12001,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12694,8 +12016,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12716,8 +12038,8 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12731,9 +12053,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12747,9 +12068,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12763,8 +12083,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12778,9 +12098,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12792,9 +12111,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12806,9 +12124,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12820,9 +12137,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12834,8 +12150,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12849,8 +12165,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12864,8 +12180,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12879,8 +12195,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12901,7 +12217,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -12926,7 +12242,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -12951,7 +12267,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -12976,7 +12292,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13001,7 +12317,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13016,7 +12331,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13031,7 +12345,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13046,7 +12359,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13061,7 +12373,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13077,7 +12388,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13093,8 +12403,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13110,8 +12418,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13134,7 +12440,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13159,7 +12465,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13184,7 +12490,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13209,7 +12515,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13234,7 +12540,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13249,7 +12554,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13264,7 +12568,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13279,7 +12582,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13294,7 +12596,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13310,7 +12611,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13326,8 +12626,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13343,8 +12641,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13367,7 +12663,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13392,7 +12688,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13417,7 +12713,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13442,7 +12738,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13467,7 +12763,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13482,7 +12777,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13497,7 +12791,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13512,7 +12805,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13527,7 +12819,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13543,7 +12834,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13559,8 +12849,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13576,8 +12864,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13600,7 +12886,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13625,7 +12911,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13650,7 +12936,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13675,7 +12961,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13700,7 +12986,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13715,7 +13000,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13730,7 +13014,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13745,7 +13028,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13760,7 +13042,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13776,7 +13057,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13792,8 +13072,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13809,8 +13087,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13833,7 +13109,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13858,7 +13134,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13883,7 +13159,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13908,7 +13184,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13933,7 +13209,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13948,7 +13223,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13963,7 +13237,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13978,7 +13251,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13993,7 +13265,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14009,7 +13280,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14025,8 +13295,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14042,8 +13310,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14066,7 +13332,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14091,7 +13357,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14116,7 +13382,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14141,7 +13407,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14166,7 +13432,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14181,7 +13446,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14196,7 +13460,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14211,7 +13474,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14226,7 +13488,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14242,7 +13503,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14258,8 +13518,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14275,8 +13533,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14299,7 +13555,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14324,7 +13580,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14349,7 +13605,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14374,7 +13630,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14399,7 +13655,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14414,7 +13669,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14429,7 +13683,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14444,7 +13697,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14459,7 +13711,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14475,7 +13726,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14491,8 +13741,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14508,8 +13756,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14532,7 +13778,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14557,7 +13803,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14582,7 +13828,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14607,7 +13853,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14632,7 +13878,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14647,7 +13892,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14662,7 +13906,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14677,7 +13920,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14692,7 +13934,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14708,7 +13949,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14724,8 +13964,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14741,8 +13979,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14765,7 +14001,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14790,7 +14026,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14815,7 +14051,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14840,7 +14076,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14865,7 +14101,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14880,7 +14115,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14895,7 +14129,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14910,7 +14143,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14925,7 +14157,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14941,7 +14172,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14957,8 +14187,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14974,8 +14202,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14998,7 +14224,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15023,7 +14249,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15048,7 +14274,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15073,7 +14299,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15098,7 +14324,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15113,7 +14338,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15128,7 +14352,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15143,7 +14366,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15158,7 +14380,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15174,7 +14395,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15190,8 +14410,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15207,8 +14425,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15231,7 +14447,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15256,7 +14472,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15281,7 +14497,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15306,7 +14522,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15331,7 +14547,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15346,7 +14561,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15361,7 +14575,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15376,7 +14589,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15391,7 +14603,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15407,7 +14618,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15423,8 +14633,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15440,8 +14648,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15464,7 +14670,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15489,7 +14695,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15514,7 +14720,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15539,7 +14745,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15564,7 +14770,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15579,7 +14784,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15594,7 +14798,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15609,7 +14812,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15624,7 +14826,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15640,7 +14841,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15656,8 +14856,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15673,8 +14871,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15697,7 +14893,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15722,7 +14918,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15747,7 +14943,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15772,7 +14968,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15797,7 +14993,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15812,7 +15007,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15827,7 +15021,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15842,7 +15035,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15857,7 +15049,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15873,7 +15064,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15889,8 +15079,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15906,8 +15094,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15930,7 +15116,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15955,7 +15141,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15980,7 +15166,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16005,7 +15191,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16030,7 +15216,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16045,7 +15230,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16060,7 +15244,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16075,7 +15258,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16090,7 +15272,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16106,7 +15287,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16122,8 +15302,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16139,8 +15317,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16163,7 +15339,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16188,7 +15364,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16213,7 +15389,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16238,7 +15414,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16263,7 +15439,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16278,7 +15453,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16293,7 +15467,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16308,7 +15481,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16323,7 +15495,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16339,7 +15510,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16355,8 +15525,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16372,8 +15540,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16396,6 +15562,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16424,6 +15591,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16452,6 +15620,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16480,6 +15649,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16508,7 +15678,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16526,7 +15695,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16544,7 +15712,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16562,7 +15729,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16580,7 +15746,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16600,7 +15765,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16620,8 +15784,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16641,8 +15803,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16671,6 +15831,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16699,6 +15860,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16727,6 +15889,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16755,6 +15918,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16783,7 +15947,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16801,7 +15964,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16819,7 +15981,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16837,7 +15998,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16855,7 +16015,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16875,7 +16034,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16895,8 +16053,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16916,8 +16072,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16946,6 +16100,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16974,6 +16129,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17002,6 +16158,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17030,6 +16187,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17058,7 +16216,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17076,7 +16233,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17094,7 +16250,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17112,7 +16267,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17130,7 +16284,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17150,7 +16303,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17170,8 +16322,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17191,8 +16341,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17221,6 +16369,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17249,6 +16398,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17277,6 +16427,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17305,6 +16456,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17333,7 +16485,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17351,7 +16502,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17369,7 +16519,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17387,7 +16536,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17405,7 +16553,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17425,7 +16572,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17445,8 +16591,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17466,8 +16610,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17496,6 +16638,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17524,6 +16667,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17552,6 +16696,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17580,6 +16725,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17608,7 +16754,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17626,7 +16771,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17644,7 +16788,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17662,7 +16805,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17680,7 +16822,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17700,7 +16841,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17720,8 +16860,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17741,8 +16879,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17771,6 +16907,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17799,6 +16936,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17827,6 +16965,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17855,6 +16994,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17883,7 +17023,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17901,7 +17040,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17919,7 +17057,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17937,7 +17074,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17955,7 +17091,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17975,7 +17110,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17995,8 +17129,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18016,8 +17148,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18046,6 +17176,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18074,6 +17205,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18102,6 +17234,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18130,6 +17263,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18158,7 +17292,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18176,7 +17309,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18194,7 +17326,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18212,7 +17343,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18230,7 +17360,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18250,7 +17379,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18270,8 +17398,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18291,8 +17417,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18321,6 +17445,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18349,6 +17474,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18377,6 +17503,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18405,6 +17532,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18433,7 +17561,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18451,7 +17578,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18469,7 +17595,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18487,7 +17612,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18505,7 +17629,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18525,7 +17648,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18545,8 +17667,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18566,8 +17686,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18596,6 +17714,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18624,6 +17743,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18652,6 +17772,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18680,6 +17801,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18708,7 +17830,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18726,7 +17847,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18744,7 +17864,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18762,7 +17881,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18780,7 +17898,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18800,7 +17917,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18820,8 +17936,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18841,8 +17955,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18871,6 +17983,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18899,6 +18012,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18927,6 +18041,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18955,6 +18070,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18983,7 +18099,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19001,7 +18116,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19019,7 +18133,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19037,7 +18150,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19055,7 +18167,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19075,7 +18186,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19095,8 +18205,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19116,8 +18224,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19146,6 +18252,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19174,6 +18281,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19202,6 +18310,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19230,6 +18339,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19258,7 +18368,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19276,7 +18385,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19294,7 +18402,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19312,7 +18419,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19330,7 +18436,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19350,7 +18455,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19370,8 +18474,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19391,8 +18493,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19421,6 +18521,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19449,6 +18550,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19477,6 +18579,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19505,6 +18608,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19533,7 +18637,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19551,7 +18654,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19569,7 +18671,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19587,7 +18688,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19605,7 +18705,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19625,7 +18724,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19645,8 +18743,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19666,8 +18762,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19696,6 +18790,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19724,6 +18819,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19752,6 +18848,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19780,6 +18877,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19808,7 +18906,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19826,7 +18923,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19844,7 +18940,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19862,7 +18957,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19880,7 +18974,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19900,7 +18993,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19920,8 +19012,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19941,8 +19031,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19971,6 +19059,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19999,6 +19088,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20027,6 +19117,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20055,6 +19146,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20083,7 +19175,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20101,7 +19192,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20119,7 +19209,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20137,7 +19226,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20155,7 +19243,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20175,7 +19262,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20195,8 +19281,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20216,8 +19300,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20246,6 +19328,7 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20274,6 +19357,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20302,6 +19386,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20330,6 +19415,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20358,7 +19444,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20376,7 +19461,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20394,7 +19478,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20412,7 +19495,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20430,7 +19512,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20450,7 +19531,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20470,8 +19550,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20491,8 +19569,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 038b58deb0cf19..6f9773f7cfd2ee 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -15,9 +15,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -30,10 +29,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -46,10 +43,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-CU-LABEL: flat_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -62,9 +57,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -77,10 +71,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -91,10 +83,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -105,10 +95,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -119,10 +107,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -133,9 +119,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX11-WGP-LABEL: flat_system_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -148,9 +133,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX11-CU-LABEL: flat_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -163,10 +147,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX12-WGP-LABEL: flat_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -179,10 +161,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX12-CU-LABEL: flat_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -202,9 +182,8 @@ entry: define amdgpu_kernel void @flat_system_monotonic_load( ; GFX7-LABEL: flat_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -217,10 +196,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -233,10 +210,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -249,9 +224,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -264,10 +238,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -278,10 +250,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -292,10 +262,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -306,10 +274,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -320,9 +286,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX11-WGP-LABEL: flat_system_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -335,9 +300,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX11-CU-LABEL: flat_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -350,10 +314,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX12-WGP-LABEL: flat_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -366,10 +328,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX12-CU-LABEL: flat_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -389,9 +349,8 @@ entry: define amdgpu_kernel void @flat_system_acquire_load( ; GFX7-LABEL: flat_system_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -405,10 +364,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -423,10 +380,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-CU-LABEL: flat_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -441,9 +396,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -456,10 +410,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -472,10 +424,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -488,10 +438,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -503,10 +451,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -518,9 +464,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX11-WGP-LABEL: flat_system_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -535,9 +480,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX11-CU-LABEL: flat_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -552,10 +496,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX12-WGP-LABEL: flat_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -569,10 +511,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX12-CU-LABEL: flat_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -593,9 +533,8 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX7-LABEL: flat_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -610,10 +549,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -630,10 +567,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -650,9 +585,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -666,10 +600,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -683,10 +615,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -700,10 +630,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -716,10 +644,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -732,9 +658,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -751,9 +676,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX11-CU-LABEL: flat_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -770,10 +694,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -793,10 +715,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX12-CU-LABEL: flat_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -823,8 +743,8 @@ entry: define amdgpu_kernel void @flat_system_unordered_store( ; GFX7-LABEL: flat_system_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -834,9 +754,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -846,9 +765,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-CU-LABEL: flat_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -858,8 +776,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -869,9 +787,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -880,9 +797,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -891,9 +807,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -902,9 +817,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -913,8 +827,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX11-WGP-LABEL: flat_system_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -924,8 +838,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX11-CU-LABEL: flat_system_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -935,8 +849,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX12-WGP-LABEL: flat_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -946,8 +860,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX12-CU-LABEL: flat_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -963,8 +877,8 @@ entry: define amdgpu_kernel void @flat_system_monotonic_store( ; GFX7-LABEL: flat_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -974,9 +888,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -986,9 +899,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -998,8 +910,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1009,9 +921,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1020,9 +931,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1031,9 +941,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1042,9 +951,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1053,8 +961,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX11-WGP-LABEL: flat_system_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1064,8 +972,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX11-CU-LABEL: flat_system_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1075,8 +983,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX12-WGP-LABEL: flat_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1086,8 +994,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX12-CU-LABEL: flat_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1103,8 +1011,8 @@ entry: define amdgpu_kernel void @flat_system_release_store( ; GFX7-LABEL: flat_system_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1115,9 +1023,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-WGP-LABEL: flat_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1129,9 +1036,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-CU-LABEL: flat_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1143,8 +1049,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1155,9 +1061,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1168,9 +1073,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1181,9 +1085,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1194,9 +1097,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1207,8 +1109,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX11-WGP-LABEL: flat_system_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1220,8 +1122,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX11-CU-LABEL: flat_system_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1233,8 +1135,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX12-WGP-LABEL: flat_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1249,8 +1151,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX12-CU-LABEL: flat_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1271,8 +1173,8 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX7-LABEL: flat_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1283,9 +1185,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1297,9 +1198,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1311,8 +1211,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1323,9 +1223,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1336,9 +1235,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1349,9 +1247,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1362,9 +1259,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1375,8 +1271,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1388,8 +1284,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX11-CU-LABEL: flat_system_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1401,8 +1297,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1417,8 +1313,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX12-CU-LABEL: flat_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1439,9 +1335,8 @@ entry: define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1451,10 +1346,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1464,10 +1357,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1477,9 +1368,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1489,10 +1379,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1501,10 +1389,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1513,10 +1399,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1525,10 +1409,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1537,9 +1419,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1549,9 +1430,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1561,10 +1441,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1574,10 +1452,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1593,9 +1469,8 @@ entry: define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX7-LABEL: flat_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1607,10 +1482,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1624,10 +1497,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1641,9 +1512,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1654,10 +1524,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1669,10 +1537,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1684,10 +1550,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1698,10 +1562,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1712,9 +1574,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1728,9 +1589,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1744,10 +1604,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1759,10 +1617,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1780,9 +1636,8 @@ entry: define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX7-LABEL: flat_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1793,10 +1648,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1808,10 +1661,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1823,9 +1674,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1836,10 +1686,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1850,10 +1698,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1864,10 +1710,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1878,10 +1722,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1892,9 +1734,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1906,9 +1747,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1920,10 +1760,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1938,10 +1776,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1962,9 +1798,8 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1977,10 +1812,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1996,10 +1829,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -2015,9 +1846,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -2029,10 +1859,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2046,10 +1874,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2063,10 +1889,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2079,10 +1903,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2095,9 +1917,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2113,9 +1934,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2131,10 +1951,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2151,10 +1969,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2177,9 +1993,8 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2192,10 +2007,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -2211,10 +2024,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -2230,9 +2041,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -2244,10 +2054,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2261,10 +2069,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2278,10 +2084,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2294,10 +2098,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2310,9 +2112,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2328,9 +2129,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2346,10 +2146,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2366,10 +2164,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2392,8 +2188,8 @@ entry: define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2408,9 +2204,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2426,9 +2221,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2444,8 +2238,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2459,9 +2253,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2475,9 +2268,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2491,9 +2283,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2506,9 +2297,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2521,8 +2311,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2538,8 +2328,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2555,8 +2345,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2571,8 +2361,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2594,8 +2384,8 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2611,9 +2401,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2631,9 +2420,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2651,8 +2439,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2667,9 +2455,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2685,9 +2472,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2703,9 +2489,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2720,9 +2505,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2737,8 +2521,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2756,8 +2540,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2775,8 +2559,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2798,8 +2582,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2828,8 +2612,8 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2845,9 +2629,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2865,9 +2648,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2885,8 +2667,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2901,9 +2683,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2919,9 +2700,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2937,9 +2717,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2954,9 +2733,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2971,8 +2749,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2990,8 +2768,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -3009,8 +2787,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -3032,8 +2810,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -3062,7 +2840,7 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3087,7 +2865,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3112,7 +2890,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3137,7 +2915,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3162,7 +2940,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3177,7 +2954,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3192,7 +2968,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3207,7 +2982,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3222,7 +2996,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3238,7 +3011,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3254,8 +3026,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3271,8 +3041,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3295,7 +3063,7 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3322,7 +3090,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3351,7 +3119,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3380,7 +3148,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3406,7 +3174,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3424,7 +3191,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3442,7 +3208,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3459,7 +3224,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3476,7 +3240,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3496,7 +3259,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3516,8 +3278,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3535,8 +3295,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3561,7 +3319,7 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3587,7 +3345,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3614,7 +3372,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3641,7 +3399,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3667,7 +3425,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3684,7 +3441,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3701,7 +3457,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3718,7 +3473,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3735,7 +3489,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3753,7 +3506,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3771,8 +3523,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3793,8 +3543,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3822,7 +3570,7 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3850,7 +3598,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3881,7 +3629,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3912,7 +3660,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3939,7 +3687,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3959,7 +3706,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3979,7 +3725,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3998,7 +3743,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4017,7 +3761,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4039,7 +3782,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4061,8 +3803,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4085,8 +3825,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4116,7 +3854,7 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4144,7 +3882,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4175,7 +3913,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4206,7 +3944,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4233,7 +3971,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4253,7 +3990,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4273,7 +4009,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4292,7 +4027,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4311,7 +4045,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4333,7 +4066,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4355,8 +4087,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4379,8 +4109,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4410,7 +4138,7 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4437,7 +4165,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4466,7 +4194,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4495,7 +4223,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4521,7 +4249,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4539,7 +4266,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4557,7 +4283,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4574,7 +4299,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4591,7 +4315,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4611,7 +4334,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4631,8 +4353,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4650,8 +4370,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4676,7 +4394,7 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4703,7 +4421,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4732,7 +4450,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4761,7 +4479,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4787,7 +4505,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4805,7 +4522,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4823,7 +4539,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4840,7 +4555,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4857,7 +4571,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4877,7 +4590,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4897,8 +4609,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4916,8 +4626,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4942,7 +4650,7 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4970,7 +4678,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5001,7 +4709,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5032,7 +4740,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5059,7 +4767,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5079,7 +4786,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5099,7 +4805,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5118,7 +4823,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5137,7 +4841,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5159,7 +4862,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5181,8 +4883,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5205,8 +4905,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5236,7 +4934,7 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5264,7 +4962,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5295,7 +4993,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5326,7 +5024,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5353,7 +5051,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5373,7 +5070,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5393,7 +5089,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5412,7 +5107,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5431,7 +5125,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5453,7 +5146,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5475,8 +5167,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5499,8 +5189,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5530,7 +5218,7 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5558,7 +5246,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5589,7 +5277,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5620,7 +5308,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5647,7 +5335,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5667,7 +5354,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5687,7 +5373,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5706,7 +5391,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5725,7 +5409,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5747,7 +5430,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5769,8 +5451,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5793,8 +5473,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5824,7 +5502,7 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5852,7 +5530,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5883,7 +5561,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5914,7 +5592,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5941,7 +5619,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5961,7 +5638,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5981,7 +5657,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6000,7 +5675,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6019,7 +5693,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6041,7 +5714,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6063,8 +5735,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6087,8 +5757,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6118,7 +5786,7 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6146,7 +5814,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6177,7 +5845,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6208,7 +5876,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6235,7 +5903,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6255,7 +5922,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6275,7 +5941,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6294,7 +5959,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6313,7 +5977,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6335,7 +5998,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6357,8 +6019,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6381,8 +6041,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6412,7 +6070,7 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6440,7 +6098,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6471,7 +6129,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6502,7 +6160,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6529,7 +6187,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6549,7 +6206,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6569,7 +6225,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6588,7 +6243,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6607,7 +6261,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6629,7 +6282,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6651,8 +6303,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6675,8 +6325,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6706,7 +6354,7 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6734,7 +6382,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6765,7 +6413,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -6796,7 +6444,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6823,7 +6471,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6843,7 +6490,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6863,7 +6509,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6882,7 +6527,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6901,7 +6545,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6923,7 +6566,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6945,8 +6587,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6969,8 +6609,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7000,7 +6638,7 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -7028,7 +6666,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -7059,7 +6697,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -7090,7 +6728,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -7117,7 +6755,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7137,7 +6774,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7157,7 +6793,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7176,7 +6811,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7195,7 +6829,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7217,7 +6850,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7239,8 +6871,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7263,8 +6893,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7294,6 +6922,7 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7322,6 +6951,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7350,6 +6980,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7378,6 +7009,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7406,7 +7038,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7424,7 +7055,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7442,7 +7072,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7460,7 +7089,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7478,7 +7106,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7498,7 +7125,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7518,8 +7144,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7539,8 +7163,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7569,6 +7191,7 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7598,6 +7221,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7628,6 +7252,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7658,6 +7283,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7686,7 +7312,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7706,7 +7331,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7726,7 +7350,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7745,7 +7368,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7764,7 +7386,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7786,7 +7407,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7808,8 +7428,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7830,8 +7448,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7861,6 +7477,7 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7890,6 +7507,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7920,6 +7538,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7950,6 +7569,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7979,7 +7599,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7999,7 +7618,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8019,7 +7637,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8039,7 +7656,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8059,7 +7675,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8081,7 +7696,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8103,8 +7717,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8129,8 +7741,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8164,6 +7774,7 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8194,6 +7805,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8226,6 +7838,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8258,6 +7871,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8287,7 +7901,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8309,7 +7922,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8331,7 +7943,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8352,7 +7963,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8373,7 +7983,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8397,7 +8006,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8421,8 +8029,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8450,8 +8056,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8488,6 +8092,7 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8518,6 +8123,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8550,6 +8156,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8582,6 +8189,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8611,7 +8219,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8633,7 +8240,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8655,7 +8261,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8676,7 +8281,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8697,7 +8301,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8721,7 +8324,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8745,8 +8347,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8774,8 +8374,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8812,6 +8410,7 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8841,6 +8440,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8871,6 +8471,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8901,6 +8502,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8929,7 +8531,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8949,7 +8550,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8969,7 +8569,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8988,7 +8587,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9007,7 +8605,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9029,7 +8626,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9051,8 +8647,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9075,8 +8669,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9108,6 +8700,7 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9137,6 +8730,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9167,6 +8761,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9197,6 +8792,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9225,7 +8821,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9245,7 +8840,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9265,7 +8859,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9284,7 +8877,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9303,7 +8895,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9325,7 +8916,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9347,8 +8937,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9369,8 +8957,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9400,6 +8986,7 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9430,6 +9017,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9462,6 +9050,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9494,6 +9083,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9523,7 +9113,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9545,7 +9134,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9567,7 +9155,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9588,7 +9175,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9609,7 +9195,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9633,7 +9218,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9657,8 +9241,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9686,8 +9268,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9724,6 +9304,7 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9754,6 +9335,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9786,6 +9368,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9818,6 +9401,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9847,7 +9431,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9869,7 +9452,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9891,7 +9473,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9912,7 +9493,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9933,7 +9513,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9957,7 +9536,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9981,8 +9559,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10010,8 +9586,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10048,6 +9622,7 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10078,6 +9653,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10110,6 +9686,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10142,6 +9719,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10171,7 +9749,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10193,7 +9770,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10215,7 +9791,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10236,7 +9811,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10257,7 +9831,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10281,7 +9854,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10305,8 +9877,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10334,8 +9904,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10372,6 +9940,7 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10402,6 +9971,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10434,6 +10004,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10466,6 +10037,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10495,7 +10067,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10517,7 +10088,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10539,7 +10109,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10560,7 +10129,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10581,7 +10149,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10605,7 +10172,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10629,8 +10195,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10658,8 +10222,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10696,6 +10258,7 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10726,6 +10289,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10758,6 +10322,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10790,6 +10355,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10819,7 +10385,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10841,7 +10406,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10863,7 +10427,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10884,7 +10447,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10905,7 +10467,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10929,7 +10490,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10953,8 +10513,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10980,8 +10538,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11016,6 +10572,7 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11046,6 +10603,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11078,6 +10636,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11110,6 +10669,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -11139,7 +10699,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11161,7 +10720,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11183,7 +10741,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11204,7 +10761,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11225,7 +10781,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11249,7 +10804,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11273,8 +10827,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11302,8 +10854,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11340,6 +10890,7 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11370,6 +10921,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11402,6 +10954,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11434,6 +10987,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -11463,7 +11017,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11485,7 +11038,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11507,7 +11059,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11528,7 +11079,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11549,7 +11099,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11573,7 +11122,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11597,8 +11145,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11626,8 +11172,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11664,6 +11208,7 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11694,6 +11239,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11726,6 +11272,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -11758,6 +11305,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -11787,7 +11335,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11809,7 +11356,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11831,7 +11377,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11852,7 +11397,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -11873,7 +11417,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11897,7 +11440,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11921,8 +11463,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11950,8 +11490,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -11988,9 +11526,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX7-LABEL: flat_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12003,10 +11540,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12019,10 +11554,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12035,9 +11568,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12050,10 +11582,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -12064,10 +11594,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -12078,10 +11606,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -12092,10 +11618,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -12106,9 +11630,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12121,9 +11644,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX11-CU-LABEL: flat_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12136,10 +11658,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX12-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12152,10 +11672,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12175,9 +11693,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX7-LABEL: flat_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12190,10 +11707,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12206,10 +11721,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12222,9 +11735,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12237,10 +11749,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12251,10 +11761,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12265,10 +11773,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -12279,10 +11785,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -12293,9 +11797,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12308,9 +11811,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12323,10 +11825,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12339,10 +11839,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12362,9 +11860,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX7-LABEL: flat_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12379,10 +11876,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12398,10 +11893,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12417,9 +11910,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12433,10 +11925,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12450,10 +11940,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -12466,10 +11954,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -12482,10 +11968,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1 @@ -12497,9 +11981,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12515,9 +11998,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12533,10 +12015,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12551,10 +12031,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12576,9 +12054,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX7-LABEL: flat_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12594,10 +12071,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12615,10 +12090,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12636,9 +12109,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12653,10 +12125,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12671,10 +12141,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12688,10 +12156,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12705,10 +12171,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12721,9 +12185,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12741,9 +12204,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12761,10 +12223,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12785,10 +12245,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12816,8 +12274,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX7-LABEL: flat_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12827,9 +12285,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12839,9 +12296,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12851,8 +12307,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12862,9 +12318,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12873,9 +12328,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12884,9 +12338,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12895,9 +12348,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12906,8 +12358,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX11-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12917,8 +12369,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX11-CU-LABEL: flat_system_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12928,8 +12380,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX12-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12939,8 +12391,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX12-CU-LABEL: flat_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12956,8 +12408,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX7-LABEL: flat_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12967,9 +12419,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12979,9 +12430,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12991,8 +12441,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13002,9 +12452,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13013,9 +12462,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13024,9 +12472,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13035,9 +12482,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13046,8 +12492,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13057,8 +12503,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13068,8 +12514,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13079,8 +12525,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13096,8 +12542,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX7-LABEL: flat_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13108,9 +12554,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13122,9 +12567,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-CU-LABEL: flat_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13136,8 +12580,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13148,9 +12592,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13161,9 +12604,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13174,9 +12616,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13187,9 +12628,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13200,8 +12640,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13213,8 +12653,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX11-CU-LABEL: flat_system_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13226,8 +12666,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13242,8 +12682,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX12-CU-LABEL: flat_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13264,8 +12704,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX7-LABEL: flat_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13276,9 +12716,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13290,9 +12729,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13304,8 +12742,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13316,9 +12754,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13329,9 +12766,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13342,9 +12778,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13355,9 +12790,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13368,8 +12802,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13381,8 +12815,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13394,8 +12828,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13410,8 +12844,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13432,9 +12866,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13444,10 +12877,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13457,10 +12888,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13470,9 +12899,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13482,10 +12910,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13494,10 +12920,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13506,10 +12930,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13518,10 +12940,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13530,9 +12950,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13542,9 +12961,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13554,10 +12972,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13567,10 +12983,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13586,9 +13000,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13600,10 +13013,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13616,10 +13027,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13632,9 +13041,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13645,10 +13053,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13660,10 +13066,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13675,10 +13079,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13689,10 +13091,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13703,9 +13103,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13718,9 +13117,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13733,10 +13131,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13748,10 +13144,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13769,9 +13163,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX7-LABEL: flat_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13782,10 +13175,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13797,10 +13188,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -13812,9 +13201,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -13825,10 +13213,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13839,10 +13225,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -13853,10 +13237,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13867,10 +13249,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -13881,9 +13261,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13895,9 +13274,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13909,10 +13287,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -13927,10 +13303,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -13951,9 +13325,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13966,10 +13339,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -13984,10 +13355,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -14002,9 +13371,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -14016,10 +13384,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -14033,10 +13399,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -14050,10 +13414,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -14066,10 +13428,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -14082,9 +13442,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -14099,9 +13458,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -14116,10 +13474,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -14136,10 +13492,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -14162,9 +13516,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -14177,10 +13530,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -14195,10 +13546,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -14213,9 +13562,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -14227,10 +13575,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -14244,10 +13590,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -14261,10 +13605,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -14277,10 +13619,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -14293,9 +13633,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -14310,9 +13649,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -14327,10 +13665,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -14347,10 +13683,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -14373,8 +13707,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14390,9 +13724,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -14409,9 +13742,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -14428,8 +13760,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -14444,9 +13776,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14461,9 +13792,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14477,9 +13807,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14493,9 +13822,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14508,8 +13836,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14526,8 +13854,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14544,8 +13872,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14561,8 +13889,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14585,8 +13913,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14603,9 +13931,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -14624,9 +13951,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -14645,8 +13971,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -14662,9 +13988,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14681,9 +14006,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14699,9 +14023,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14717,9 +14040,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14734,8 +14056,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14754,8 +14076,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14774,8 +14096,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14798,8 +14120,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -14829,8 +14151,8 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14847,9 +14169,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -14868,9 +14189,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -14889,8 +14209,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -14906,9 +14226,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14925,9 +14244,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -14943,9 +14261,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14961,9 +14278,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -14978,8 +14294,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -14998,8 +14314,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -15018,8 +14334,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -15042,8 +14358,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -15073,7 +14389,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15098,7 +14414,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15123,7 +14439,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15148,7 +14464,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15173,7 +14489,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15188,7 +14503,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15203,7 +14517,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15218,7 +14531,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15233,7 +14545,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15249,7 +14560,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15265,8 +14575,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15282,8 +14590,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15306,7 +14612,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15333,7 +14639,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15361,7 +14667,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15389,7 +14695,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15415,7 +14721,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15433,7 +14738,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15451,7 +14755,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15468,7 +14771,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15485,7 +14787,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15504,7 +14805,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15523,8 +14823,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15542,8 +14840,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15568,7 +14864,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15594,7 +14890,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15621,7 +14917,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15648,7 +14944,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15674,7 +14970,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15691,7 +14986,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15708,7 +15002,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15725,7 +15018,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15742,7 +15034,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15760,7 +15051,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15778,8 +15068,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15800,8 +15088,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15829,7 +15115,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15857,7 +15143,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15887,7 +15173,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15917,7 +15203,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15944,7 +15230,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15964,7 +15249,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15984,7 +15268,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16003,7 +15286,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16022,7 +15304,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16043,7 +15324,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16064,8 +15344,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16088,8 +15366,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16119,7 +15395,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16147,7 +15423,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16177,7 +15453,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16207,7 +15483,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16234,7 +15510,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16254,7 +15529,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16274,7 +15548,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16293,7 +15566,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16312,7 +15584,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16333,7 +15604,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16354,8 +15624,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16378,8 +15646,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16409,7 +15675,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16436,7 +15702,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16464,7 +15730,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16492,7 +15758,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16518,7 +15784,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16536,7 +15801,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16554,7 +15818,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16571,7 +15834,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16588,7 +15850,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16607,7 +15868,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16626,8 +15886,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16645,8 +15903,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16671,7 +15927,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16698,7 +15954,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16726,7 +15982,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16754,7 +16010,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16780,7 +16036,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16798,7 +16053,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16816,7 +16070,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16833,7 +16086,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16850,7 +16102,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16869,7 +16120,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16888,8 +16138,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16907,8 +16155,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16933,7 +16179,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16961,7 +16207,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16991,7 +16237,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17021,7 +16267,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17048,7 +16294,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17068,7 +16313,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17088,7 +16332,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17107,7 +16350,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17126,7 +16368,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17147,7 +16388,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17168,8 +16408,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17192,8 +16430,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17223,7 +16459,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17251,7 +16487,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17281,7 +16517,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17311,7 +16547,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17338,7 +16574,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17358,7 +16593,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17378,7 +16612,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17397,7 +16630,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17416,7 +16648,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17437,7 +16668,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17458,8 +16688,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17482,8 +16710,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17513,7 +16739,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17541,7 +16767,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17571,7 +16797,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17601,7 +16827,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17628,7 +16854,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17648,7 +16873,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17668,7 +16892,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17687,7 +16910,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17706,7 +16928,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17727,7 +16948,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17748,8 +16968,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17772,8 +16990,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17803,7 +17019,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17831,7 +17047,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17861,7 +17077,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -17891,7 +17107,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -17918,7 +17134,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17938,7 +17153,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17958,7 +17172,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17977,7 +17190,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17996,7 +17208,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18017,7 +17228,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18038,8 +17248,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18062,8 +17270,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18093,7 +17299,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18121,7 +17327,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18151,7 +17357,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18181,7 +17387,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -18208,7 +17414,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18228,7 +17433,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18248,7 +17452,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18267,7 +17470,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18286,7 +17488,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18307,7 +17508,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18328,8 +17528,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18352,8 +17550,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18383,7 +17579,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18411,7 +17607,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18441,7 +17637,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18471,7 +17667,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -18498,7 +17694,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18518,7 +17713,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18538,7 +17732,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18557,7 +17750,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18576,7 +17768,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18597,7 +17788,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18618,8 +17808,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18642,8 +17830,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18673,7 +17859,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18701,7 +17887,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18731,7 +17917,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -18761,7 +17947,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -18788,7 +17974,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18808,7 +17993,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18828,7 +18012,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18847,7 +18030,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18866,7 +18048,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18887,7 +18068,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18908,8 +18088,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18932,8 +18110,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18963,7 +18139,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18991,7 +18167,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -19021,7 +18197,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -19051,7 +18227,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -19078,7 +18254,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19098,7 +18273,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19118,7 +18292,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19137,7 +18310,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19156,7 +18328,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19177,7 +18348,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19198,8 +18368,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19222,8 +18390,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19253,6 +18419,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19281,6 +18448,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19309,6 +18477,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19337,6 +18506,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19365,7 +18535,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19383,7 +18552,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19401,7 +18569,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19419,7 +18586,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19437,7 +18603,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19457,7 +18622,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19477,8 +18641,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19498,8 +18660,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19528,6 +18688,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19558,6 +18719,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19589,6 +18751,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19620,6 +18783,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19649,7 +18813,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19670,7 +18833,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19690,7 +18852,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19710,7 +18871,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19729,7 +18889,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19752,7 +18911,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19775,8 +18933,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19798,8 +18954,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19830,6 +18984,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19859,6 +19014,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19889,6 +19045,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19919,6 +19076,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19948,7 +19106,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19968,7 +19125,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19988,7 +19144,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20008,7 +19163,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20028,7 +19182,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20050,7 +19203,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20072,8 +19224,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20098,8 +19248,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20133,6 +19281,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20164,6 +19313,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20197,6 +19347,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20230,6 +19381,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20260,7 +19412,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20283,7 +19434,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20305,7 +19455,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20327,7 +19476,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20348,7 +19496,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20373,7 +19520,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20398,8 +19544,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20428,8 +19572,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20467,6 +19609,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20498,6 +19641,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20531,6 +19675,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20564,6 +19709,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20594,7 +19740,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20617,7 +19762,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20639,7 +19783,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20661,7 +19804,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20682,7 +19824,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20707,7 +19848,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20732,8 +19872,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20762,8 +19900,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20801,6 +19937,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20831,6 +19968,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20862,6 +20000,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20893,6 +20032,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20922,7 +20062,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20943,7 +20082,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20963,7 +20101,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20983,7 +20120,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21002,7 +20138,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21025,7 +20160,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21048,8 +20182,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21073,8 +20205,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21107,6 +20237,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21137,6 +20268,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21168,6 +20300,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21199,6 +20332,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21228,7 +20362,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21249,7 +20382,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21269,7 +20401,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21289,7 +20420,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21308,7 +20438,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21331,7 +20460,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21354,8 +20482,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21377,8 +20503,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21409,6 +20533,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21440,6 +20565,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21473,6 +20599,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21506,6 +20633,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21536,7 +20664,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21559,7 +20686,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21581,7 +20707,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21603,7 +20728,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21624,7 +20748,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21649,7 +20772,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21674,8 +20796,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21704,8 +20824,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21743,6 +20861,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21774,6 +20893,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21807,6 +20927,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -21840,6 +20961,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21870,7 +20992,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21893,7 +21014,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21915,7 +21035,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21937,7 +21056,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21958,7 +21076,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21983,7 +21100,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22008,8 +21124,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22038,8 +21152,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22077,6 +21189,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22108,6 +21221,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22141,6 +21255,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22174,6 +21289,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22204,7 +21320,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22227,7 +21342,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22249,7 +21363,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22271,7 +21384,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22292,7 +21404,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22317,7 +21428,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22342,8 +21452,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22372,8 +21480,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22411,6 +21517,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22442,6 +21549,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22475,6 +21583,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22508,6 +21617,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22538,7 +21648,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22561,7 +21670,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22583,7 +21691,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22605,7 +21712,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22626,7 +21732,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22651,7 +21756,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22676,8 +21780,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22706,8 +21808,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22745,6 +21845,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22776,6 +21877,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22809,6 +21911,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -22842,6 +21945,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22872,7 +21976,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22895,7 +21998,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22917,7 +22019,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22939,7 +22040,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -22960,7 +22060,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -22985,7 +22084,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23010,8 +22108,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23038,8 +22134,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23075,6 +22169,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23106,6 +22201,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23139,6 +22235,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23172,6 +22269,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -23202,7 +22300,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23225,7 +22322,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23247,7 +22343,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23269,7 +22364,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23290,7 +22384,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23315,7 +22408,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23340,8 +22432,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23370,8 +22460,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23409,6 +22497,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23440,6 +22529,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23473,6 +22563,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23506,6 +22597,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -23536,7 +22628,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23559,7 +22650,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23581,7 +22671,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23603,7 +22692,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23624,7 +22712,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23649,7 +22736,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23674,8 +22760,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23704,8 +22788,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23743,6 +22825,7 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -23774,6 +22857,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23807,6 +22891,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -23840,6 +22925,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -23870,7 +22956,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23893,7 +22978,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -23915,7 +22999,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23937,7 +23020,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -23958,7 +23040,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -23983,7 +23064,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -24008,8 +23088,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -24038,8 +23116,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index ebae2b6152e7bf..f10715033e4338 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -11,9 +11,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -27,10 +26,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -44,10 +41,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -61,9 +56,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -77,9 +71,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX11-WGP-LABEL: flat_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -93,9 +86,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX11-CU-LABEL: flat_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -109,10 +101,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX12-WGP-LABEL: flat_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -128,10 +118,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX12-CU-LABEL: flat_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -154,8 +142,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -183,8 +172,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_nop 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX10-WGP-NEXT: s_mov_b32 s6, 0 @@ -211,8 +202,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_nop 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX10-CU-NEXT: s_mov_b32 s6, 0 @@ -239,8 +232,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 @@ -268,8 +262,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX11-WGP-LABEL: flat_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 @@ -298,8 +293,9 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX11-CU-LABEL: flat_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX11-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 @@ -328,8 +324,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX12-WGP-LABEL: flat_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-WGP-NEXT: s_wait_alu 0xfffe +; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 @@ -365,8 +363,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX12-CU-LABEL: flat_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX12-CU-NEXT: s_wait_alu 0xfffe +; GFX12-CU-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 @@ -411,9 +411,8 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -427,10 +426,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -444,10 +441,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -461,9 +456,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -477,9 +471,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX11-WGP-LABEL: flat_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -493,9 +486,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX11-CU-LABEL: flat_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -509,10 +501,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX12-WGP-LABEL: flat_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -530,10 +520,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX12-CU-LABEL: flat_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -558,8 +546,8 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 @@ -587,9 +575,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 @@ -616,9 +603,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 @@ -645,8 +631,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 @@ -674,8 +660,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -704,8 +690,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -734,8 +720,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX12-WGP-LABEL: flat_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -773,8 +759,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX12-CU-LABEL: flat_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -821,9 +807,8 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-LABEL: flat_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -837,10 +822,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -854,10 +837,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -871,9 +852,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -887,9 +867,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -903,9 +882,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -919,10 +897,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -936,10 +912,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -960,8 +934,8 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-LABEL: flat_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -972,9 +946,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -986,9 +959,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -999,8 +971,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1011,8 +983,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX11-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1024,8 +996,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX11-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1036,8 +1008,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX12-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1051,8 +1023,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX12-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 23982f8a00cdb8..0d837e42c6155e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -15,9 +15,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -30,10 +29,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -46,10 +43,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -62,9 +57,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -77,10 +71,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -91,10 +83,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -105,10 +95,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -119,10 +107,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -133,9 +119,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX11-WGP-LABEL: flat_wavefront_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -148,9 +133,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX11-CU-LABEL: flat_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -163,10 +147,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX12-WGP-LABEL: flat_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -179,10 +161,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX12-CU-LABEL: flat_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -202,9 +182,8 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX7-LABEL: flat_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -217,10 +196,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -233,10 +210,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -249,9 +224,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -264,10 +238,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -278,10 +250,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -292,10 +262,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -306,10 +274,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -320,9 +286,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -335,9 +300,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -350,10 +314,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -366,10 +328,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -389,9 +349,8 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-LABEL: flat_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -404,10 +363,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -420,10 +377,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -436,9 +391,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -451,10 +405,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -465,10 +417,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -479,10 +429,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -493,10 +441,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -507,9 +453,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -522,9 +467,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -537,10 +481,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -553,10 +495,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -576,9 +516,8 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-LABEL: flat_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -591,10 +530,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -607,10 +544,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -623,9 +558,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -638,10 +572,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -652,10 +584,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -666,10 +596,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -680,10 +608,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -694,9 +620,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -709,9 +634,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -724,10 +648,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -740,10 +662,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -763,8 +683,8 @@ entry: define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX7-LABEL: flat_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -774,9 +694,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -786,9 +705,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -798,8 +716,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -809,9 +727,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -820,9 +737,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -831,9 +747,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -842,9 +757,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -853,8 +767,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX11-WGP-LABEL: flat_wavefront_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -864,8 +778,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX11-CU-LABEL: flat_wavefront_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -875,8 +789,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX12-WGP-LABEL: flat_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -886,8 +800,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX12-CU-LABEL: flat_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -903,8 +817,8 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX7-LABEL: flat_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -914,9 +828,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -926,9 +839,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -938,8 +850,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -949,9 +861,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -960,9 +871,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -971,9 +881,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -982,9 +891,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -993,8 +901,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1004,8 +912,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1015,8 +923,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1026,8 +934,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1043,8 +951,8 @@ entry: define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-LABEL: flat_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1054,9 +962,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1066,9 +973,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1078,8 +984,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1089,9 +995,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1100,9 +1005,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1111,9 +1015,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1122,9 +1025,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1133,8 +1035,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX11-WGP-LABEL: flat_wavefront_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1144,8 +1046,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX11-CU-LABEL: flat_wavefront_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1155,8 +1057,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX12-WGP-LABEL: flat_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1166,8 +1068,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX12-CU-LABEL: flat_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1183,8 +1085,8 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-LABEL: flat_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1194,9 +1096,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1206,9 +1107,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1218,8 +1118,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1229,9 +1129,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1240,9 +1139,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1251,9 +1149,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1262,9 +1159,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1273,8 +1169,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1284,8 +1180,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1295,8 +1191,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1306,8 +1202,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1323,9 +1219,8 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1335,10 +1230,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1348,10 +1241,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1361,9 +1252,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1373,10 +1263,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1385,10 +1273,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1397,10 +1283,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1409,10 +1293,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1421,9 +1303,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1433,9 +1314,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1445,10 +1325,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1458,10 +1336,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1477,9 +1353,8 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1489,10 +1364,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1502,10 +1375,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1515,9 +1386,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1527,10 +1397,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1539,10 +1407,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1551,10 +1417,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1563,10 +1427,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1575,9 +1437,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1587,9 +1448,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1599,10 +1459,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1612,10 +1470,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1631,9 +1487,8 @@ entry: define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1643,10 +1498,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1656,10 +1509,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1669,9 +1520,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1681,10 +1531,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1693,10 +1541,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1705,10 +1551,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1717,10 +1561,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1729,9 +1571,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1741,9 +1582,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1753,10 +1593,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1766,10 +1604,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1785,9 +1621,8 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1797,10 +1632,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1810,10 +1643,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1823,9 +1654,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1835,10 +1665,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1847,10 +1675,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1859,10 +1685,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1871,10 +1695,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1883,9 +1705,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1895,9 +1716,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1907,10 +1727,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1920,10 +1738,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1939,9 +1755,8 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1951,10 +1766,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1964,10 +1777,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1977,9 +1788,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1989,10 +1799,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2001,10 +1809,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2013,10 +1819,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2025,10 +1829,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2037,9 +1839,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2049,9 +1850,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2061,10 +1861,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2074,10 +1872,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2093,8 +1889,8 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2108,9 +1904,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2124,9 +1919,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2140,8 +1934,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2155,9 +1949,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2169,9 +1962,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2183,9 +1975,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2197,9 +1988,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2211,8 +2001,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2226,8 +2016,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2241,8 +2031,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2256,8 +2046,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2278,8 +2068,8 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2293,9 +2083,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2309,9 +2098,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2325,8 +2113,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2340,9 +2128,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2354,9 +2141,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2368,9 +2154,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2382,9 +2167,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2396,8 +2180,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2411,8 +2195,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2426,8 +2210,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2441,8 +2225,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2463,8 +2247,8 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2478,9 +2262,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2494,9 +2277,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2510,8 +2292,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2525,9 +2307,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2539,9 +2320,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2553,9 +2333,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2567,9 +2346,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2581,8 +2359,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2596,8 +2374,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2611,8 +2389,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2626,8 +2404,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2648,7 +2426,7 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -2673,7 +2451,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2698,7 +2476,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2723,7 +2501,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -2748,7 +2526,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2763,7 +2540,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2778,7 +2554,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -2793,7 +2568,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -2808,7 +2582,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2824,7 +2597,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2840,8 +2612,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2857,8 +2627,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -2881,7 +2649,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -2906,7 +2674,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2931,7 +2699,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2956,7 +2724,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -2981,7 +2749,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2996,7 +2763,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3011,7 +2777,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3026,7 +2791,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3041,7 +2805,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3057,7 +2820,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3073,8 +2835,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3090,8 +2850,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3114,7 +2872,7 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3139,7 +2897,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3164,7 +2922,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3189,7 +2947,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3214,7 +2972,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3229,7 +2986,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3244,7 +3000,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3259,7 +3014,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3274,7 +3028,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3290,7 +3043,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3306,8 +3058,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3323,8 +3073,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3347,7 +3095,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3372,7 +3120,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3397,7 +3145,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3422,7 +3170,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3447,7 +3195,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3462,7 +3209,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3477,7 +3223,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3492,7 +3237,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3507,7 +3251,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3523,7 +3266,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3539,8 +3281,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3556,8 +3296,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3580,7 +3318,7 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3605,7 +3343,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3630,7 +3368,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3655,7 +3393,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3680,7 +3418,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3695,7 +3432,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3710,7 +3446,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3725,7 +3460,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3740,7 +3474,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3756,7 +3489,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3772,8 +3504,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3789,8 +3519,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3813,7 +3541,7 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3838,7 +3566,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3863,7 +3591,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3888,7 +3616,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3913,7 +3641,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3928,7 +3655,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3943,7 +3669,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3958,7 +3683,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3973,7 +3697,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3989,7 +3712,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4005,8 +3727,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4022,8 +3742,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4046,7 +3764,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4071,7 +3789,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4096,7 +3814,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4121,7 +3839,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4146,7 +3864,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4161,7 +3878,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4176,7 +3892,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4191,7 +3906,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4206,7 +3920,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4222,7 +3935,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4238,8 +3950,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4255,8 +3965,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4279,7 +3987,7 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4304,7 +4012,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4329,7 +4037,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4354,7 +4062,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4379,7 +4087,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4394,7 +4101,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4409,7 +4115,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4424,7 +4129,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4439,7 +4143,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4455,7 +4158,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4471,8 +4173,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4488,8 +4188,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4512,7 +4210,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4537,7 +4235,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4562,7 +4260,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4587,7 +4285,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4612,7 +4310,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4627,7 +4324,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4642,7 +4338,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4657,7 +4352,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4672,7 +4366,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4688,7 +4381,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4704,8 +4396,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4721,8 +4411,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4745,7 +4433,7 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4770,7 +4458,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4795,7 +4483,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4820,7 +4508,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4845,7 +4533,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4860,7 +4547,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4875,7 +4561,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4890,7 +4575,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4905,7 +4589,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4921,7 +4604,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4937,8 +4619,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4954,8 +4634,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4978,7 +4656,7 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5003,7 +4681,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5028,7 +4706,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5053,7 +4731,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5078,7 +4756,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5093,7 +4770,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5108,7 +4784,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5123,7 +4798,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5138,7 +4812,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5154,7 +4827,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5170,8 +4842,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5187,8 +4857,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5211,7 +4879,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5236,7 +4904,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5261,7 +4929,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5286,7 +4954,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5311,7 +4979,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5326,7 +4993,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5341,7 +5007,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5356,7 +5021,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5371,7 +5035,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5387,7 +5050,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5403,8 +5065,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5420,8 +5080,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5444,7 +5102,7 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5469,7 +5127,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5494,7 +5152,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5519,7 +5177,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5544,7 +5202,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5559,7 +5216,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5574,7 +5230,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5589,7 +5244,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5604,7 +5258,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5620,7 +5273,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5636,8 +5288,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5653,8 +5303,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5677,7 +5325,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5702,7 +5350,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5727,7 +5375,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5752,7 +5400,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5777,7 +5425,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5792,7 +5439,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5807,7 +5453,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5822,7 +5467,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5837,7 +5481,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5853,7 +5496,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5869,8 +5511,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5886,8 +5526,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5910,7 +5548,7 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5935,7 +5573,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5960,7 +5598,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5985,7 +5623,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -6010,7 +5648,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6025,7 +5662,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6040,7 +5676,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6055,7 +5690,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6070,7 +5704,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6086,7 +5719,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6102,8 +5734,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6119,8 +5749,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6143,6 +5771,7 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6171,6 +5800,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6199,6 +5829,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6227,6 +5858,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6255,7 +5887,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6273,7 +5904,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6291,7 +5921,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6309,7 +5938,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6327,7 +5955,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6347,7 +5974,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6367,8 +5993,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6388,8 +6012,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6418,6 +6040,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6446,6 +6069,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6474,6 +6098,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6502,6 +6127,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6530,7 +6156,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6548,7 +6173,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6566,7 +6190,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6584,7 +6207,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6602,7 +6224,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6622,7 +6243,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6642,8 +6262,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6663,8 +6281,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6693,6 +6309,7 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6721,6 +6338,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6749,6 +6367,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6777,6 +6396,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6805,7 +6425,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6823,7 +6442,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6841,7 +6459,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6859,7 +6476,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6877,7 +6493,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6897,7 +6512,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6917,8 +6531,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6938,8 +6550,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6968,6 +6578,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6996,6 +6607,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7024,6 +6636,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7052,6 +6665,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7080,7 +6694,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7098,7 +6711,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7116,7 +6728,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7134,7 +6745,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7152,7 +6762,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7172,7 +6781,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7192,8 +6800,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7213,8 +6819,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7243,6 +6847,7 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7271,6 +6876,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7299,6 +6905,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7327,6 +6934,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7355,7 +6963,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7373,7 +6980,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7391,7 +6997,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7409,7 +7014,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7427,7 +7031,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7447,7 +7050,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7467,8 +7069,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7488,8 +7088,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7518,6 +7116,7 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7546,6 +7145,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7574,6 +7174,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7602,6 +7203,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7630,7 +7232,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7648,7 +7249,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7666,7 +7266,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7684,7 +7283,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7702,7 +7300,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7722,7 +7319,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7742,8 +7338,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7763,8 +7357,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7793,6 +7385,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7821,6 +7414,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7849,6 +7443,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7877,6 +7472,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7905,7 +7501,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7923,7 +7518,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7941,7 +7535,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7959,7 +7552,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7977,7 +7569,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7997,7 +7588,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8017,8 +7607,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8038,8 +7626,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8068,6 +7654,7 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8096,6 +7683,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8124,6 +7712,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8152,6 +7741,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8180,7 +7770,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8198,7 +7787,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8216,7 +7804,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8234,7 +7821,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8252,7 +7838,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8272,7 +7857,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8292,8 +7876,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8313,8 +7895,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8343,6 +7923,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8371,6 +7952,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8399,6 +7981,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8427,6 +8010,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8455,7 +8039,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8473,7 +8056,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8491,7 +8073,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8509,7 +8090,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8527,7 +8107,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8547,7 +8126,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8567,8 +8145,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8588,8 +8164,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8618,6 +8192,7 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8646,6 +8221,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8674,6 +8250,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8702,6 +8279,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8730,7 +8308,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8748,7 +8325,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8766,7 +8342,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8784,7 +8359,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8802,7 +8376,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8822,7 +8395,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8842,8 +8414,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8863,8 +8433,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8893,6 +8461,7 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8921,6 +8490,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8949,6 +8519,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8977,6 +8548,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9005,7 +8577,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9023,7 +8594,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9041,7 +8611,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9059,7 +8628,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9077,7 +8645,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9097,7 +8664,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9117,8 +8683,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9138,8 +8702,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9168,6 +8730,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9196,6 +8759,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9224,6 +8788,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9252,6 +8817,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9280,7 +8846,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9298,7 +8863,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9316,7 +8880,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9334,7 +8897,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9352,7 +8914,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9372,7 +8933,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9392,8 +8952,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9413,8 +8971,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9443,6 +8999,7 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9471,6 +9028,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9499,6 +9057,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9527,6 +9086,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9555,7 +9115,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9573,7 +9132,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9591,7 +9149,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9609,7 +9166,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9627,7 +9183,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9647,7 +9202,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9667,8 +9221,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9688,8 +9240,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9718,6 +9268,7 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9746,6 +9297,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9774,6 +9326,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9802,6 +9355,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9830,7 +9384,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9848,7 +9401,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9866,7 +9418,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9884,7 +9435,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9902,7 +9452,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9922,7 +9471,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9942,8 +9490,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9963,8 +9509,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9993,6 +9537,7 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10021,6 +9566,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10049,6 +9595,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10077,6 +9624,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10105,7 +9653,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10123,7 +9670,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10141,7 +9687,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10159,7 +9704,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10177,7 +9721,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10197,7 +9740,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10217,8 +9759,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10238,8 +9778,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10268,9 +9806,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX7-LABEL: flat_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10283,10 +9820,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10299,10 +9834,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10315,9 +9848,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10330,10 +9862,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10344,10 +9874,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10358,10 +9886,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10372,10 +9898,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10386,9 +9910,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10401,9 +9924,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10416,10 +9938,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10432,10 +9952,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10455,9 +9973,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10470,10 +9987,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10486,10 +10001,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10502,9 +10015,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10517,10 +10029,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10531,10 +10041,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10545,10 +10053,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10559,10 +10065,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10573,9 +10077,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10588,9 +10091,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10603,10 +10105,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10619,10 +10119,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10642,9 +10140,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-LABEL: flat_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10657,10 +10154,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10673,10 +10168,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10689,9 +10182,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10704,10 +10196,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10718,10 +10208,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10732,10 +10220,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10746,10 +10232,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10760,9 +10244,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10775,9 +10258,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10790,10 +10272,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10806,10 +10286,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10829,9 +10307,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10844,10 +10321,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10860,10 +10335,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10876,9 +10349,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10891,10 +10363,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10905,10 +10375,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10919,10 +10387,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10933,10 +10399,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10947,9 +10411,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10962,9 +10425,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10977,10 +10439,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10993,10 +10453,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11016,8 +10474,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX7-LABEL: flat_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11027,9 +10485,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11039,9 +10496,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11051,8 +10507,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11062,9 +10518,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11073,9 +10528,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11084,9 +10538,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11095,9 +10548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11106,8 +10558,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11117,8 +10569,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11128,8 +10580,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11139,8 +10591,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11156,8 +10608,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11167,9 +10619,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11179,9 +10630,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11191,8 +10641,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11202,9 +10652,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11213,9 +10662,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11224,9 +10672,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11235,9 +10682,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11246,8 +10692,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11257,8 +10703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11268,8 +10714,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11279,8 +10725,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11296,8 +10742,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-LABEL: flat_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11307,9 +10753,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11319,9 +10764,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11331,8 +10775,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11342,9 +10786,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11353,9 +10796,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11364,9 +10806,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11375,9 +10816,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11386,8 +10826,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11397,8 +10837,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11408,8 +10848,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11419,8 +10859,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11436,8 +10876,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11447,9 +10887,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11459,9 +10898,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11471,8 +10909,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11482,9 +10920,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11493,9 +10930,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11504,9 +10940,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11515,9 +10950,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11526,8 +10960,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11537,8 +10971,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11548,8 +10982,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11559,8 +10993,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11576,9 +11010,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11588,10 +11021,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11601,10 +11032,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11614,9 +11043,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11626,10 +11054,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11638,10 +11064,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11650,10 +11074,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11662,10 +11084,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11674,9 +11094,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11686,9 +11105,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11698,10 +11116,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11711,10 +11127,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11730,9 +11144,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11742,10 +11155,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11755,10 +11166,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11768,9 +11177,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11780,10 +11188,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11792,10 +11198,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11804,10 +11208,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11816,10 +11218,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11828,9 +11228,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11840,9 +11239,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11852,10 +11250,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11865,10 +11261,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11884,9 +11278,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11896,10 +11289,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11909,10 +11300,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11922,9 +11311,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11934,10 +11322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11946,10 +11332,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11958,10 +11342,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11970,10 +11352,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11982,9 +11362,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11994,9 +11373,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12006,10 +11384,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12019,10 +11395,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12038,9 +11412,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12050,10 +11423,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12063,10 +11434,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12076,9 +11445,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12088,10 +11456,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12100,10 +11466,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12112,10 +11476,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12124,10 +11486,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12136,9 +11496,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12148,9 +11507,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12160,10 +11518,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12173,10 +11529,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12192,9 +11546,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12204,10 +11557,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12217,10 +11568,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12230,9 +11579,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12242,10 +11590,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12254,10 +11600,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12266,10 +11610,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12278,10 +11620,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12290,9 +11630,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12302,9 +11641,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12314,10 +11652,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12327,10 +11663,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12346,8 +11680,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12361,9 +11695,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12377,9 +11710,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12393,8 +11725,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12408,9 +11740,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12422,9 +11753,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12436,9 +11766,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12450,9 +11779,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12464,8 +11792,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12479,8 +11807,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12494,8 +11822,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12509,8 +11837,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12531,8 +11859,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12546,9 +11874,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12562,9 +11889,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12578,8 +11904,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12593,9 +11919,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12607,9 +11932,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12621,9 +11945,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12635,9 +11958,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12649,8 +11971,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12664,8 +11986,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12679,8 +12001,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12694,8 +12016,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12716,8 +12038,8 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12731,9 +12053,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12747,9 +12068,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12763,8 +12083,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12778,9 +12098,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12792,9 +12111,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12806,9 +12124,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12820,9 +12137,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12834,8 +12150,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12849,8 +12165,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12864,8 +12180,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12879,8 +12195,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12901,7 +12217,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -12926,7 +12242,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -12951,7 +12267,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -12976,7 +12292,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13001,7 +12317,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13016,7 +12331,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13031,7 +12345,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13046,7 +12359,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13061,7 +12373,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13077,7 +12388,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13093,8 +12403,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13110,8 +12418,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13134,7 +12440,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13159,7 +12465,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13184,7 +12490,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13209,7 +12515,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13234,7 +12540,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13249,7 +12554,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13264,7 +12568,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13279,7 +12582,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13294,7 +12596,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13310,7 +12611,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13326,8 +12626,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13343,8 +12641,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13367,7 +12663,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13392,7 +12688,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13417,7 +12713,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13442,7 +12738,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13467,7 +12763,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13482,7 +12777,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13497,7 +12791,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13512,7 +12805,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13527,7 +12819,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13543,7 +12834,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13559,8 +12849,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13576,8 +12864,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13600,7 +12886,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13625,7 +12911,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13650,7 +12936,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13675,7 +12961,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13700,7 +12986,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13715,7 +13000,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13730,7 +13014,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13745,7 +13028,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13760,7 +13042,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13776,7 +13057,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13792,8 +13072,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13809,8 +13087,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13833,7 +13109,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13858,7 +13134,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13883,7 +13159,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13908,7 +13184,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13933,7 +13209,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13948,7 +13223,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13963,7 +13237,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13978,7 +13251,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13993,7 +13265,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14009,7 +13280,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14025,8 +13295,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14042,8 +13310,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14066,7 +13332,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14091,7 +13357,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14116,7 +13382,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14141,7 +13407,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14166,7 +13432,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14181,7 +13446,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14196,7 +13460,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14211,7 +13474,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14226,7 +13488,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14242,7 +13503,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14258,8 +13518,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14275,8 +13533,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14299,7 +13555,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14324,7 +13580,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14349,7 +13605,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14374,7 +13630,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14399,7 +13655,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14414,7 +13669,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14429,7 +13683,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14444,7 +13697,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14459,7 +13711,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14475,7 +13726,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14491,8 +13741,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14508,8 +13756,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14532,7 +13778,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14557,7 +13803,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14582,7 +13828,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14607,7 +13853,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14632,7 +13878,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14647,7 +13892,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14662,7 +13906,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14677,7 +13920,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14692,7 +13934,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14708,7 +13949,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14724,8 +13964,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14741,8 +13979,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14765,7 +14001,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14790,7 +14026,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14815,7 +14051,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14840,7 +14076,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14865,7 +14101,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14880,7 +14115,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14895,7 +14129,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14910,7 +14143,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14925,7 +14157,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14941,7 +14172,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14957,8 +14187,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14974,8 +14202,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14998,7 +14224,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15023,7 +14249,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15048,7 +14274,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15073,7 +14299,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15098,7 +14324,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15113,7 +14338,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15128,7 +14352,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15143,7 +14366,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15158,7 +14380,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15174,7 +14395,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15190,8 +14410,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15207,8 +14425,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15231,7 +14447,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15256,7 +14472,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15281,7 +14497,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15306,7 +14522,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15331,7 +14547,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15346,7 +14561,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15361,7 +14575,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15376,7 +14589,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15391,7 +14603,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15407,7 +14618,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15423,8 +14633,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15440,8 +14648,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15464,7 +14670,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15489,7 +14695,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15514,7 +14720,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15539,7 +14745,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15564,7 +14770,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15579,7 +14784,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15594,7 +14798,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15609,7 +14812,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15624,7 +14826,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15640,7 +14841,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15656,8 +14856,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15673,8 +14871,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15697,7 +14893,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15722,7 +14918,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15747,7 +14943,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15772,7 +14968,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15797,7 +14993,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15812,7 +15007,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15827,7 +15021,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15842,7 +15035,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15857,7 +15049,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15873,7 +15064,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15889,8 +15079,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15906,8 +15094,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15930,7 +15116,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15955,7 +15141,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15980,7 +15166,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16005,7 +15191,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16030,7 +15216,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16045,7 +15230,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16060,7 +15244,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16075,7 +15258,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16090,7 +15272,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16106,7 +15287,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16122,8 +15302,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16139,8 +15317,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16163,7 +15339,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16188,7 +15364,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16213,7 +15389,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16238,7 +15414,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16263,7 +15439,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16278,7 +15453,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16293,7 +15467,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16308,7 +15481,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16323,7 +15495,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16339,7 +15510,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16355,8 +15525,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16372,8 +15540,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16396,6 +15562,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16424,6 +15591,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16452,6 +15620,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16480,6 +15649,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16508,7 +15678,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16526,7 +15695,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16544,7 +15712,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16562,7 +15729,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16580,7 +15746,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16600,7 +15765,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16620,8 +15784,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16641,8 +15803,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16671,6 +15831,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16699,6 +15860,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16727,6 +15889,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16755,6 +15918,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16783,7 +15947,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16801,7 +15964,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16819,7 +15981,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16837,7 +15998,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16855,7 +16015,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16875,7 +16034,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16895,8 +16053,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16916,8 +16072,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16946,6 +16100,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16974,6 +16129,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17002,6 +16158,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17030,6 +16187,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17058,7 +16216,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17076,7 +16233,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17094,7 +16250,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17112,7 +16267,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17130,7 +16284,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17150,7 +16303,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17170,8 +16322,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17191,8 +16341,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17221,6 +16369,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17249,6 +16398,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17277,6 +16427,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17305,6 +16456,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17333,7 +16485,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17351,7 +16502,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17369,7 +16519,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17387,7 +16536,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17405,7 +16553,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17425,7 +16572,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17445,8 +16591,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17466,8 +16610,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17496,6 +16638,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17524,6 +16667,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17552,6 +16696,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17580,6 +16725,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17608,7 +16754,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17626,7 +16771,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17644,7 +16788,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17662,7 +16805,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17680,7 +16822,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17700,7 +16841,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17720,8 +16860,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17741,8 +16879,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17771,6 +16907,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17799,6 +16936,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17827,6 +16965,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17855,6 +16994,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17883,7 +17023,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17901,7 +17040,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17919,7 +17057,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17937,7 +17074,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17955,7 +17091,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17975,7 +17110,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17995,8 +17129,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18016,8 +17148,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18046,6 +17176,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18074,6 +17205,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18102,6 +17234,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18130,6 +17263,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18158,7 +17292,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18176,7 +17309,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18194,7 +17326,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18212,7 +17343,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18230,7 +17360,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18250,7 +17379,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18270,8 +17398,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18291,8 +17417,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18321,6 +17445,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18349,6 +17474,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18377,6 +17503,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18405,6 +17532,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18433,7 +17561,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18451,7 +17578,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18469,7 +17595,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18487,7 +17612,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18505,7 +17629,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18525,7 +17648,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18545,8 +17667,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18566,8 +17686,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18596,6 +17714,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18624,6 +17743,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18652,6 +17772,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18680,6 +17801,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18708,7 +17830,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18726,7 +17847,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18744,7 +17864,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18762,7 +17881,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18780,7 +17898,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18800,7 +17917,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18820,8 +17936,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18841,8 +17955,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18871,6 +17983,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18899,6 +18012,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18927,6 +18041,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18955,6 +18070,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18983,7 +18099,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19001,7 +18116,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19019,7 +18133,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19037,7 +18150,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19055,7 +18167,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19075,7 +18186,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19095,8 +18205,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19116,8 +18224,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19146,6 +18252,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19174,6 +18281,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19202,6 +18310,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19230,6 +18339,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19258,7 +18368,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19276,7 +18385,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19294,7 +18402,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19312,7 +18419,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19330,7 +18436,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19350,7 +18455,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19370,8 +18474,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19391,8 +18493,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19421,6 +18521,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19449,6 +18550,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19477,6 +18579,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19505,6 +18608,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19533,7 +18637,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19551,7 +18654,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19569,7 +18671,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19587,7 +18688,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19605,7 +18705,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19625,7 +18724,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19645,8 +18743,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19666,8 +18762,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19696,6 +18790,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19724,6 +18819,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19752,6 +18848,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19780,6 +18877,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19808,7 +18906,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19826,7 +18923,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19844,7 +18940,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19862,7 +18957,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19880,7 +18974,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19900,7 +18993,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19920,8 +19012,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19941,8 +19031,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19971,6 +19059,7 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19999,6 +19088,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20027,6 +19117,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20055,6 +19146,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20083,7 +19175,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20101,7 +19192,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20119,7 +19209,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20137,7 +19226,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20155,7 +19243,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20175,7 +19262,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20195,8 +19281,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20216,8 +19300,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 4a073a771ac0c0..71dcfa060c83c1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -15,9 +15,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -30,10 +29,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -46,10 +43,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -62,9 +57,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -77,10 +71,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -91,10 +83,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -105,10 +95,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -119,10 +107,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -133,9 +119,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX11-WGP-LABEL: flat_workgroup_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -148,9 +133,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX11-CU-LABEL: flat_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -163,10 +147,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX12-WGP-LABEL: flat_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -179,10 +161,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX12-CU-LABEL: flat_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -202,9 +182,8 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX7-LABEL: flat_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -217,10 +196,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -233,10 +210,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -249,9 +224,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -264,10 +238,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -278,10 +250,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -292,10 +262,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -306,10 +274,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -320,9 +286,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -335,9 +300,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -350,10 +314,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -366,10 +328,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -389,9 +349,8 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-LABEL: flat_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -405,10 +364,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -422,10 +379,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -439,9 +394,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -455,10 +409,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -470,10 +422,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -485,10 +435,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -500,10 +448,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -515,9 +461,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -531,9 +476,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -547,10 +491,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -564,10 +506,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -588,9 +528,8 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-LABEL: flat_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -605,10 +544,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -624,10 +561,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -642,9 +577,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -659,10 +593,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -675,10 +607,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -691,10 +621,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -707,10 +635,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -723,9 +649,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -741,9 +666,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -758,10 +682,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -781,10 +703,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -806,8 +726,8 @@ entry: define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX7-LABEL: flat_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -817,9 +737,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -829,9 +748,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -841,8 +759,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -852,9 +770,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -863,9 +780,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -874,9 +790,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -885,9 +800,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -896,8 +810,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX11-WGP-LABEL: flat_workgroup_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -907,8 +821,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX11-CU-LABEL: flat_workgroup_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -918,8 +832,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX12-WGP-LABEL: flat_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -929,8 +843,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX12-CU-LABEL: flat_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -946,8 +860,8 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX7-LABEL: flat_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -957,9 +871,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -969,9 +882,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -981,8 +893,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -992,9 +904,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1003,9 +914,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1014,9 +924,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1025,9 +934,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1036,8 +944,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1047,8 +955,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1058,8 +966,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1069,8 +977,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1086,8 +994,8 @@ entry: define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-LABEL: flat_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1098,9 +1006,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1112,9 +1019,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1125,8 +1031,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1137,9 +1043,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1149,9 +1054,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1161,9 +1065,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1173,9 +1076,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1185,8 +1087,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX11-WGP-LABEL: flat_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1198,8 +1100,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX11-CU-LABEL: flat_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1210,8 +1112,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX12-WGP-LABEL: flat_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1225,8 +1127,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX12-CU-LABEL: flat_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1243,8 +1145,8 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-LABEL: flat_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1255,9 +1157,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1269,9 +1170,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1282,8 +1182,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1294,9 +1194,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1306,9 +1205,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1318,9 +1216,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1330,9 +1227,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1342,8 +1238,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1355,8 +1251,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1367,8 +1263,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1382,8 +1278,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1400,9 +1296,8 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1412,10 +1307,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1425,10 +1318,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1438,9 +1329,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1450,10 +1340,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1462,10 +1350,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1474,10 +1360,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1486,10 +1370,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1498,9 +1380,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1510,9 +1391,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1522,10 +1402,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1535,10 +1413,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1554,9 +1430,8 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1567,10 +1442,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1583,10 +1456,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1597,9 +1468,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1610,10 +1480,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1623,10 +1491,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1637,10 +1503,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1650,10 +1514,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1664,9 +1526,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1679,9 +1540,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1692,10 +1552,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1707,10 +1565,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1727,9 +1583,8 @@ entry: define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1740,10 +1595,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1755,10 +1608,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1769,9 +1620,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,10 +1632,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1795,10 +1643,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1808,10 +1654,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1821,10 +1665,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -1834,9 +1676,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1848,9 +1689,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1861,10 +1701,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -1878,10 +1716,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -1898,9 +1734,8 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1912,10 +1747,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -1930,10 +1763,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -1945,9 +1776,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -1959,10 +1789,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1973,10 +1801,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -1988,10 +1814,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2002,10 +1826,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2017,9 +1839,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2034,9 +1855,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2048,10 +1868,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2067,10 +1885,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2088,9 +1904,8 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2102,10 +1917,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -2120,10 +1933,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -2135,9 +1946,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -2149,10 +1959,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2163,10 +1971,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -2178,10 +1984,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2192,10 +1996,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -2207,9 +2009,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2224,9 +2025,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2238,10 +2038,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -2257,10 +2055,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -2278,8 +2074,8 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2294,9 +2090,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2311,9 +2106,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2328,8 +2122,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2344,9 +2138,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2359,9 +2152,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2374,9 +2166,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2389,9 +2180,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2404,8 +2194,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2420,8 +2210,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2436,8 +2226,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2452,8 +2242,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2475,8 +2265,8 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2492,9 +2282,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2511,9 +2300,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2529,8 +2317,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2546,9 +2334,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2562,9 +2349,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2578,9 +2364,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2594,9 +2379,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2610,8 +2394,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2628,8 +2412,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2645,8 +2429,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2667,8 +2451,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2691,8 +2475,8 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2708,9 +2492,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2727,9 +2510,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2745,8 +2527,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -2762,9 +2544,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2778,9 +2559,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -2794,9 +2574,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2810,9 +2589,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -2826,8 +2604,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2844,8 +2622,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2861,8 +2639,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2883,8 +2661,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2907,7 +2685,7 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -2932,7 +2710,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2957,7 +2735,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -2982,7 +2760,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3007,7 +2785,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3022,7 +2799,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3037,7 +2813,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3052,7 +2827,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3067,7 +2841,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3083,7 +2856,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3099,8 +2871,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3116,8 +2886,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3140,7 +2908,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3166,7 +2934,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3194,7 +2962,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3220,7 +2988,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3246,7 +3014,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3262,7 +3029,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3279,7 +3045,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3295,7 +3060,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3312,7 +3076,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3331,7 +3094,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3348,8 +3110,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3367,8 +3127,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3392,7 +3150,7 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3418,7 +3176,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3445,7 +3203,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3471,7 +3229,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3497,7 +3255,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3513,7 +3270,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3529,7 +3285,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3545,7 +3300,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3561,7 +3315,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3579,7 +3332,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3596,8 +3348,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3617,8 +3367,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3642,7 +3390,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3669,7 +3417,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3699,7 +3447,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3726,7 +3474,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -3753,7 +3501,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3770,7 +3517,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3788,7 +3534,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3805,7 +3550,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -3823,7 +3567,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3844,7 +3587,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3862,8 +3604,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3885,8 +3625,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -3911,7 +3649,7 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3938,7 +3676,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3968,7 +3706,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -3995,7 +3733,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4022,7 +3760,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4039,7 +3776,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4057,7 +3793,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4074,7 +3809,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4092,7 +3826,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4113,7 +3846,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4131,8 +3863,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4154,8 +3884,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4180,7 +3908,7 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4206,7 +3934,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4234,7 +3962,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4260,7 +3988,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4286,7 +4014,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4302,7 +4029,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4319,7 +4045,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4335,7 +4060,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4352,7 +4076,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4371,7 +4094,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4388,8 +4110,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4407,8 +4127,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4432,7 +4150,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4458,7 +4176,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4486,7 +4204,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4512,7 +4230,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4538,7 +4256,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4554,7 +4271,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4571,7 +4287,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4587,7 +4302,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4604,7 +4318,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4623,7 +4336,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4640,8 +4352,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4659,8 +4369,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4684,7 +4392,7 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4711,7 +4419,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4741,7 +4449,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -4768,7 +4476,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -4795,7 +4503,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4812,7 +4519,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4830,7 +4536,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4847,7 +4552,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -4865,7 +4569,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4886,7 +4589,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4904,8 +4606,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4927,8 +4627,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -4953,7 +4651,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4980,7 +4678,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5010,7 +4708,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5037,7 +4735,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5064,7 +4762,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5081,7 +4778,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5099,7 +4795,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5116,7 +4811,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5134,7 +4828,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5155,7 +4848,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5173,8 +4865,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5196,8 +4886,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5222,7 +4910,7 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5249,7 +4937,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5279,7 +4967,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5306,7 +4994,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5333,7 +5021,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5350,7 +5037,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5368,7 +5054,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5385,7 +5070,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5403,7 +5087,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5424,7 +5107,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5442,8 +5124,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5465,8 +5145,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5491,7 +5169,7 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5518,7 +5196,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5548,7 +5226,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -5575,7 +5253,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -5602,7 +5280,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5619,7 +5296,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5637,7 +5313,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5654,7 +5329,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5672,7 +5346,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5693,7 +5366,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5711,8 +5383,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5734,8 +5404,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5760,6 +5428,7 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5788,6 +5457,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -5816,6 +5486,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -5844,6 +5515,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5872,7 +5544,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5890,7 +5561,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5908,7 +5578,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5926,7 +5595,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -5944,7 +5612,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5964,7 +5631,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -5984,8 +5650,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6005,8 +5669,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6035,6 +5697,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6064,6 +5727,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6093,6 +5757,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6122,6 +5787,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6151,7 +5817,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6170,7 +5835,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6189,7 +5853,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6208,7 +5871,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6227,7 +5889,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6248,7 +5909,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6269,8 +5929,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6291,8 +5949,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6322,6 +5978,7 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6351,6 +6008,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6381,6 +6039,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6410,6 +6069,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6439,7 +6099,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6458,7 +6117,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6477,7 +6135,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6496,7 +6153,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6515,7 +6171,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6537,7 +6192,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6558,8 +6212,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6583,8 +6235,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6614,6 +6264,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6644,6 +6295,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6675,6 +6327,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6705,6 +6358,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6735,7 +6389,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6755,7 +6408,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6775,7 +6427,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6795,7 +6446,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -6815,7 +6465,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6838,7 +6487,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6860,8 +6508,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6888,8 +6534,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -6920,6 +6564,7 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6950,6 +6595,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -6981,6 +6627,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7011,6 +6658,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7041,7 +6689,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7061,7 +6708,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7081,7 +6727,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7101,7 +6746,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7121,7 +6765,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7144,7 +6787,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7166,8 +6808,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7194,8 +6834,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7226,6 +6864,7 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7255,6 +6894,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7284,6 +6924,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7313,6 +6954,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7342,7 +6984,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7361,7 +7002,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7380,7 +7020,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7399,7 +7038,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7418,7 +7056,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7439,7 +7076,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7460,8 +7096,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7484,8 +7118,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7515,6 +7147,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7544,6 +7177,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7573,6 +7207,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7602,6 +7237,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7631,7 +7267,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7650,7 +7285,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7669,7 +7303,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7688,7 +7321,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7707,7 +7339,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7728,7 +7359,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7749,8 +7379,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7771,8 +7399,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -7802,6 +7428,7 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7832,6 +7459,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7863,6 +7491,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -7893,6 +7522,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7923,7 +7553,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7943,7 +7572,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7963,7 +7591,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -7983,7 +7610,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8003,7 +7629,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8026,7 +7651,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8048,8 +7672,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8076,8 +7698,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8108,6 +7728,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8138,6 +7759,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8169,6 +7791,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8199,6 +7822,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8229,7 +7853,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8249,7 +7872,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8269,7 +7891,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8289,7 +7910,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8309,7 +7929,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8332,7 +7951,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8354,8 +7972,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8382,8 +7998,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8414,6 +8028,7 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8444,6 +8059,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8475,6 +8091,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8505,6 +8122,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8535,7 +8153,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8555,7 +8172,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8575,7 +8191,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8595,7 +8210,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8615,7 +8229,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8638,7 +8251,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8660,8 +8272,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8688,8 +8298,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8720,6 +8328,7 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8750,6 +8359,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8781,6 +8391,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -8811,6 +8422,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8841,7 +8453,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8861,7 +8472,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8881,7 +8491,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8901,7 +8510,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -8921,7 +8529,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8944,7 +8551,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8966,8 +8572,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -8994,8 +8598,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9026,6 +8628,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9056,6 +8659,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9087,6 +8691,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9117,6 +8722,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9147,7 +8753,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9167,7 +8772,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9187,7 +8791,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9207,7 +8810,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9227,7 +8829,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9250,7 +8851,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9272,8 +8872,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9298,8 +8896,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9330,6 +8926,7 @@ entry: define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9360,6 +8957,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9391,6 +8989,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9421,6 +9020,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9451,7 +9051,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9471,7 +9070,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9491,7 +9089,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9511,7 +9108,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9531,7 +9127,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9554,7 +9149,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9576,8 +9170,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9604,8 +9196,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9636,6 +9226,7 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9666,6 +9257,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9697,6 +9289,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -9727,6 +9320,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9757,7 +9351,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9777,7 +9370,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9797,7 +9389,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9817,7 +9408,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -9837,7 +9427,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9860,7 +9449,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9882,8 +9470,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9910,8 +9496,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -9942,6 +9526,7 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9972,6 +9557,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10003,6 +9589,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -10033,6 +9620,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10063,7 +9651,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10083,7 +9670,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10103,7 +9689,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10123,7 +9708,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -10143,7 +9727,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10166,7 +9749,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10188,8 +9770,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10216,8 +9796,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -10248,9 +9826,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX7-LABEL: flat_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10263,10 +9840,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10279,10 +9854,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10295,9 +9868,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10310,10 +9882,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10324,10 +9894,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10338,10 +9906,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10352,10 +9918,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10366,9 +9930,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10381,9 +9944,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10396,10 +9958,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10412,10 +9972,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10435,9 +9993,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10450,10 +10007,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10466,10 +10021,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10482,9 +10035,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10497,10 +10049,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10511,10 +10061,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -10525,10 +10073,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -10539,10 +10085,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -10553,9 +10097,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10568,9 +10111,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10583,10 +10125,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10599,10 +10139,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10622,9 +10160,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-LABEL: flat_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10637,10 +10174,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10655,10 +10190,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10671,9 +10204,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10686,10 +10218,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10700,10 +10230,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc @@ -10715,10 +10243,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -10729,10 +10255,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -10744,9 +10268,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10761,9 +10284,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10776,10 +10298,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10794,10 +10314,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10817,9 +10335,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10832,10 +10349,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -10852,10 +10367,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -10868,9 +10381,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -10883,10 +10395,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] @@ -10897,10 +10407,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10913,10 +10421,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 @@ -10927,10 +10433,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10943,9 +10447,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -10962,9 +10465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -10977,10 +10479,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11001,10 +10501,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11024,8 +10522,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX7-LABEL: flat_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11035,9 +10533,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11047,9 +10544,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11059,8 +10555,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11070,9 +10566,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11081,9 +10576,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11092,9 +10586,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11103,9 +10596,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11114,8 +10606,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11125,8 +10617,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11136,8 +10628,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11147,8 +10639,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11164,8 +10656,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11175,9 +10667,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11187,9 +10678,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11199,8 +10689,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11210,9 +10700,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11221,9 +10710,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11232,9 +10720,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11243,9 +10730,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11254,8 +10740,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11265,8 +10751,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11276,8 +10762,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11287,8 +10773,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11304,8 +10790,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-LABEL: flat_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11315,9 +10801,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11329,9 +10814,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11341,8 +10825,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11352,9 +10836,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11363,9 +10846,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11375,9 +10857,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11386,9 +10867,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11398,8 +10878,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11411,8 +10891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11422,8 +10902,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11437,8 +10917,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11454,8 +10934,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11465,9 +10945,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11479,9 +10958,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11491,8 +10969,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11502,9 +10980,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11513,9 +10990,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11525,9 +11001,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11536,9 +11011,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11548,8 +11022,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11561,8 +11035,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11572,8 +11046,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11587,8 +11061,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11604,9 +11078,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11616,10 +11089,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11629,10 +11100,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11642,9 +11111,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11654,10 +11122,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11666,10 +11132,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11678,10 +11142,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11690,10 +11152,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11702,9 +11162,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11714,9 +11173,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11726,10 +11184,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11739,10 +11195,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11758,9 +11212,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11770,10 +11223,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11785,10 +11236,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11798,9 +11247,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11810,10 +11258,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11822,10 +11268,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11836,10 +11280,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11848,10 +11290,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -11862,9 +11302,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11876,9 +11315,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11888,10 +11326,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -11903,10 +11339,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -11922,9 +11356,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11934,10 +11367,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -11949,10 +11380,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -11962,9 +11391,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -11974,10 +11402,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11986,10 +11412,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -11999,10 +11423,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12011,10 +11433,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12024,9 +11444,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12038,9 +11457,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12050,10 +11468,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12067,10 +11483,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12086,9 +11500,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12098,10 +11511,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12115,10 +11526,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12128,9 +11537,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12140,10 +11548,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12152,10 +11558,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12167,10 +11571,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12179,10 +11581,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12194,9 +11594,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12210,9 +11609,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12222,10 +11620,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12241,10 +11637,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12260,9 +11654,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12272,10 +11665,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 @@ -12289,10 +11680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 @@ -12302,9 +11691,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 @@ -12314,10 +11702,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12326,10 +11712,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 @@ -12341,10 +11725,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12353,10 +11735,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 @@ -12368,9 +11748,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12384,9 +11763,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12396,10 +11774,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 @@ -12415,10 +11791,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 @@ -12434,8 +11808,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12449,9 +11823,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12467,9 +11840,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12483,8 +11855,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12498,9 +11870,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12512,9 +11883,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12527,9 +11897,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12541,9 +11910,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12556,8 +11924,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12573,8 +11941,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12588,8 +11956,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12605,8 +11973,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12627,8 +11995,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12642,9 +12010,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12662,9 +12029,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12678,8 +12044,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12693,9 +12059,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12707,9 +12072,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12723,9 +12087,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12737,9 +12100,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12753,8 +12115,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12772,8 +12134,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12787,8 +12149,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12810,8 +12172,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12832,8 +12194,8 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12847,9 +12209,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -12867,9 +12228,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -12883,8 +12243,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -12898,9 +12258,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12912,9 +12271,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 @@ -12928,9 +12286,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12942,9 +12299,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -12958,8 +12314,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -12977,8 +12333,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -12992,8 +12348,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -13015,8 +12371,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -13037,7 +12393,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13062,7 +12418,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13087,7 +12443,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13112,7 +12468,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13137,7 +12493,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13152,7 +12507,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13167,7 +12521,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13182,7 +12535,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13197,7 +12549,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13213,7 +12564,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13229,8 +12579,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13246,8 +12594,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13270,7 +12616,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13295,7 +12641,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13322,7 +12668,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13347,7 +12693,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13372,7 +12718,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13387,7 +12732,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13404,7 +12748,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13419,7 +12762,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13436,7 +12778,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13454,7 +12795,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13470,8 +12810,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13489,8 +12827,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13513,7 +12849,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13538,7 +12874,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13565,7 +12901,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13590,7 +12926,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13615,7 +12951,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13630,7 +12965,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13646,7 +12980,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13661,7 +12994,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13677,7 +13009,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13695,7 +13026,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13711,8 +13041,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13732,8 +13060,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13756,7 +13082,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13781,7 +13107,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13810,7 +13136,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -13835,7 +13161,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -13860,7 +13186,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13875,7 +13200,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13893,7 +13217,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13908,7 +13231,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -13926,7 +13248,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13946,7 +13267,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13962,8 +13282,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -13985,8 +13303,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14009,7 +13325,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14034,7 +13350,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14063,7 +13379,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14088,7 +13404,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14113,7 +13429,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14128,7 +13443,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14146,7 +13460,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14161,7 +13474,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14179,7 +13491,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14199,7 +13510,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14215,8 +13525,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14238,8 +13546,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14262,7 +13568,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14287,7 +13593,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14314,7 +13620,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14339,7 +13645,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14364,7 +13670,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14379,7 +13684,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14396,7 +13700,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14411,7 +13714,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14428,7 +13730,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14446,7 +13747,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14462,8 +13762,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14481,8 +13779,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14505,7 +13801,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14530,7 +13826,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14557,7 +13853,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14582,7 +13878,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14607,7 +13903,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14622,7 +13917,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14639,7 +13933,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14654,7 +13947,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14671,7 +13963,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14689,7 +13980,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14705,8 +13995,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14724,8 +14012,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14748,7 +14034,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14773,7 +14059,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14802,7 +14088,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -14827,7 +14113,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -14852,7 +14138,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14867,7 +14152,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14885,7 +14169,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14900,7 +14183,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -14918,7 +14200,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14938,7 +14219,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14954,8 +14234,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -14977,8 +14255,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15001,7 +14277,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15026,7 +14302,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15055,7 +14331,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15080,7 +14356,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15105,7 +14381,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15120,7 +14395,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15138,7 +14412,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15153,7 +14426,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15171,7 +14443,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15191,7 +14462,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15207,8 +14477,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15230,8 +14498,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15254,7 +14520,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15279,7 +14545,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15308,7 +14574,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15333,7 +14599,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15358,7 +14624,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15373,7 +14638,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15391,7 +14655,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15406,7 +14669,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15424,7 +14686,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15444,7 +14705,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15460,8 +14720,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15483,8 +14741,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15507,7 +14763,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15532,7 +14788,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15561,7 +14817,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15586,7 +14842,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15611,7 +14867,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15626,7 +14881,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15644,7 +14898,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15659,7 +14912,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15677,7 +14929,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15697,7 +14948,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15713,8 +14963,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15736,8 +14984,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15760,7 +15006,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15785,7 +15031,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15814,7 +15060,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -15839,7 +15085,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -15864,7 +15110,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15879,7 +15124,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15897,7 +15141,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15912,7 +15155,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -15930,7 +15172,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15950,7 +15191,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15966,8 +15206,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -15989,8 +15227,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16013,7 +15249,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16038,7 +15274,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16067,7 +15303,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16092,7 +15328,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16117,7 +15353,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16132,7 +15367,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16150,7 +15384,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16165,7 +15398,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16183,7 +15415,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16203,7 +15434,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16219,8 +15449,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16242,8 +15470,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16266,7 +15492,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16291,7 +15517,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16320,7 +15546,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16345,7 +15571,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16370,7 +15596,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16385,7 +15610,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16403,7 +15627,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16418,7 +15641,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16436,7 +15658,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16456,7 +15677,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16472,8 +15692,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16495,8 +15713,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16519,7 +15735,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16544,7 +15760,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16573,7 +15789,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc @@ -16598,7 +15814,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 @@ -16623,7 +15839,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16638,7 +15853,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16656,7 +15870,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16671,7 +15884,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16689,7 +15901,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16709,7 +15920,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16725,8 +15935,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16748,8 +15956,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16772,6 +15978,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16800,6 +16007,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16828,6 +16036,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -16856,6 +16065,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16884,7 +16094,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16902,7 +16111,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16920,7 +16128,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16938,7 +16145,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -16956,7 +16162,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16976,7 +16181,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -16996,8 +16200,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17017,8 +16219,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17047,6 +16247,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17075,6 +16276,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17105,6 +16307,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17133,6 +16336,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17161,7 +16365,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17179,7 +16382,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17198,7 +16400,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17216,7 +16417,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17235,7 +16435,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17257,7 +16456,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17277,8 +16475,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17300,8 +16496,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17330,6 +16524,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17358,6 +16553,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17388,6 +16584,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17416,6 +16613,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17444,7 +16642,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17462,7 +16659,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17481,7 +16677,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17499,7 +16694,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17518,7 +16712,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17540,7 +16733,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17560,8 +16752,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17585,8 +16775,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17615,6 +16803,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17643,6 +16832,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17675,6 +16865,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17703,6 +16894,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17731,7 +16923,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17749,7 +16940,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17769,7 +16959,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17787,7 +16976,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -17807,7 +16995,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17831,7 +17018,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17851,8 +17037,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17880,8 +17064,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -17910,6 +17092,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17938,6 +17121,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17970,6 +17154,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -17998,6 +17183,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18026,7 +17212,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18044,7 +17229,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18064,7 +17248,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18082,7 +17265,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18102,7 +17284,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18126,7 +17307,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18146,8 +17326,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18175,8 +17353,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18205,6 +17381,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18233,6 +17410,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18263,6 +17441,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18291,6 +17470,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18319,7 +17499,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18337,7 +17516,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18356,7 +17534,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18374,7 +17551,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18393,7 +17569,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18415,7 +17590,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18435,8 +17609,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18460,8 +17632,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18490,6 +17660,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18518,6 +17689,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18548,6 +17720,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18576,6 +17749,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18604,7 +17778,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18622,7 +17795,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18641,7 +17813,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18659,7 +17830,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18678,7 +17848,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18700,7 +17869,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18720,8 +17888,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18743,8 +17909,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18773,6 +17937,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18801,6 +17966,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18833,6 +17999,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -18861,6 +18028,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18889,7 +18057,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18907,7 +18074,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18927,7 +18093,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18945,7 +18110,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -18965,7 +18129,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -18989,7 +18152,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19009,8 +18171,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19038,8 +18198,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19068,6 +18226,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19096,6 +18255,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19128,6 +18288,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19156,6 +18317,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19184,7 +18346,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19202,7 +18363,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19222,7 +18382,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19240,7 +18399,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19260,7 +18418,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19284,7 +18441,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19304,8 +18460,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19333,8 +18487,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19363,6 +18515,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19391,6 +18544,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19423,6 +18577,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19451,6 +18606,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19479,7 +18635,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19497,7 +18652,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19517,7 +18671,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19535,7 +18688,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19555,7 +18707,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19579,7 +18730,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19599,8 +18749,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19628,8 +18776,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19658,6 +18804,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19686,6 +18833,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19718,6 +18866,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -19746,6 +18895,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19774,7 +18924,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19792,7 +18941,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19812,7 +18960,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19830,7 +18977,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -19850,7 +18996,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19874,7 +19019,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19894,8 +19038,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19923,8 +19065,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -19953,6 +19093,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19981,6 +19122,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20013,6 +19155,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20041,6 +19184,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20069,7 +19213,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20087,7 +19230,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20107,7 +19249,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20125,7 +19266,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20145,7 +19285,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20169,7 +19308,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20189,8 +19327,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20216,8 +19352,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20246,6 +19380,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20274,6 +19409,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20306,6 +19442,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20334,6 +19471,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20362,7 +19500,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20380,7 +19517,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20400,7 +19536,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20418,7 +19553,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20438,7 +19572,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20462,7 +19595,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20482,8 +19614,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20511,8 +19641,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20541,6 +19669,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20569,6 +19698,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20601,6 +19731,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20629,6 +19760,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20657,7 +19789,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20675,7 +19806,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20695,7 +19825,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20713,7 +19842,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -20733,7 +19861,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20757,7 +19884,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20777,8 +19903,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20806,8 +19930,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -20836,6 +19958,7 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20864,6 +19987,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20896,6 +20020,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc @@ -20924,6 +20049,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20952,7 +20078,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20970,7 +20095,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20990,7 +20114,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21008,7 +20131,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc @@ -21028,7 +20150,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21052,7 +20173,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21072,8 +20192,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc @@ -21101,8 +20219,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index ddfc232bdf55b3..586b8ec05f30cf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -16,8 +16,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -40,9 +41,8 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX7-LABEL: global_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -55,11 +55,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX10-WGP-LABEL: global_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -68,11 +66,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX10-CU-LABEL: global_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -81,8 +77,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -105,11 +102,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -118,11 +113,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -131,11 +124,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -144,11 +135,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -157,10 +146,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX11-WGP-LABEL: global_agent_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -169,10 +157,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX11-CU-LABEL: global_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -181,11 +168,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX12-WGP-LABEL: global_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -194,11 +179,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX12-CU-LABEL: global_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -214,8 +197,9 @@ entry: define amdgpu_kernel void @global_agent_monotonic_load( ; GFX6-LABEL: global_agent_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -238,9 +222,8 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX7-LABEL: global_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -253,11 +236,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX10-WGP-LABEL: global_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -266,11 +247,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX10-CU-LABEL: global_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -279,8 +258,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -303,11 +283,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -316,11 +294,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -329,11 +305,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -342,11 +316,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -355,10 +327,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX11-WGP-LABEL: global_agent_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -367,10 +338,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX11-CU-LABEL: global_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -379,11 +349,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX12-WGP-LABEL: global_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -392,11 +360,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX12-CU-LABEL: global_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -412,8 +378,9 @@ entry: define amdgpu_kernel void @global_agent_acquire_load( ; GFX6-LABEL: global_agent_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -437,9 +404,8 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX7-LABEL: global_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -453,11 +419,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX10-WGP-LABEL: global_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -468,11 +432,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX10-CU-LABEL: global_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -483,8 +445,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -507,11 +470,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -521,11 +482,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -535,11 +494,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -549,11 +506,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -563,10 +518,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX11-WGP-LABEL: global_agent_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -577,10 +531,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX11-CU-LABEL: global_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -591,11 +544,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX12-WGP-LABEL: global_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -605,11 +556,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX12-CU-LABEL: global_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -626,8 +575,9 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX6-LABEL: global_agent_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -652,9 +602,8 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX7-LABEL: global_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -669,11 +618,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -685,11 +632,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX10-CU-LABEL: global_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -701,8 +646,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -726,11 +672,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -740,11 +684,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -754,11 +696,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -768,11 +708,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -782,10 +720,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -797,10 +734,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX11-CU-LABEL: global_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -812,11 +748,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -832,11 +766,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX12-CU-LABEL: global_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -859,8 +791,9 @@ entry: define amdgpu_kernel void @global_agent_unordered_store( ; GFX6-LABEL: global_agent_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -876,8 +809,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX7-LABEL: global_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -887,10 +820,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX10-WGP-LABEL: global_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -899,10 +830,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX10-CU-LABEL: global_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -911,8 +840,9 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -928,10 +858,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -940,10 +868,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -952,10 +878,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -964,10 +888,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -976,9 +898,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX11-WGP-LABEL: global_agent_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -987,9 +908,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX11-CU-LABEL: global_agent_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -998,10 +918,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX12-WGP-LABEL: global_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1010,10 +928,8 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX12-CU-LABEL: global_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1028,8 +944,9 @@ entry: define amdgpu_kernel void @global_agent_monotonic_store( ; GFX6-LABEL: global_agent_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1045,8 +962,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX7-LABEL: global_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1056,10 +973,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX10-WGP-LABEL: global_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1068,10 +983,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX10-CU-LABEL: global_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1080,8 +993,9 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1097,10 +1011,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1109,10 +1021,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1121,10 +1031,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1133,10 +1041,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1145,9 +1051,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX11-WGP-LABEL: global_agent_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1156,9 +1061,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX11-CU-LABEL: global_agent_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1167,10 +1071,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX12-WGP-LABEL: global_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1179,10 +1081,8 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX12-CU-LABEL: global_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1197,8 +1097,9 @@ entry: define amdgpu_kernel void @global_agent_release_store( ; GFX6-LABEL: global_agent_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1215,8 +1116,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX7-LABEL: global_agent_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1227,10 +1128,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX10-WGP-LABEL: global_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1241,10 +1140,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX10-CU-LABEL: global_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1255,8 +1152,9 @@ define amdgpu_kernel void @global_agent_release_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1273,10 +1171,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1286,10 +1182,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1299,10 +1193,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1313,10 +1205,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1327,9 +1217,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX11-WGP-LABEL: global_agent_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1340,9 +1229,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX11-CU-LABEL: global_agent_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1353,10 +1241,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX12-WGP-LABEL: global_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1369,10 +1255,8 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX12-CU-LABEL: global_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1391,8 +1275,9 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX6-LABEL: global_agent_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1409,8 +1294,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX7-LABEL: global_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1421,10 +1306,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1435,10 +1318,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX10-CU-LABEL: global_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1449,8 +1330,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1467,10 +1349,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1480,10 +1360,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1493,10 +1371,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1507,10 +1383,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1521,9 +1395,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1534,9 +1407,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX11-CU-LABEL: global_agent_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1547,10 +1419,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1563,10 +1433,8 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX12-CU-LABEL: global_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1585,8 +1453,8 @@ entry: define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX6-LABEL: global_agent_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1602,9 +1470,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1615,9 +1482,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1626,9 +1492,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1636,8 +1501,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1654,9 +1519,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1665,9 +1529,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1676,9 +1539,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1687,9 +1549,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1698,8 +1559,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_agent_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1708,8 +1569,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_agent_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1718,8 +1579,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -1728,8 +1589,8 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -1743,8 +1604,8 @@ entry: define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX6-LABEL: global_agent_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1762,9 +1623,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1777,9 +1637,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1791,9 +1650,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX10-CU-LABEL: global_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1804,8 +1662,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1823,9 +1681,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1836,9 +1693,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1849,9 +1705,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1862,9 +1717,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1875,8 +1729,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_agent_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1888,8 +1742,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX11-CU-LABEL: global_agent_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1901,8 +1755,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -1913,8 +1767,8 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-LABEL: global_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -1930,8 +1784,8 @@ entry: define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX6-LABEL: global_agent_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1948,9 +1802,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX7-LABEL: global_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1962,9 +1815,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX10-WGP-LABEL: global_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1975,9 +1827,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX10-CU-LABEL: global_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1987,8 +1838,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2006,9 +1857,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2018,9 +1868,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2030,9 +1879,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2043,9 +1891,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2056,8 +1903,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX11-WGP-LABEL: global_agent_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2068,8 +1915,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX11-CU-LABEL: global_agent_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2080,8 +1927,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-WGP-LABEL: global_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2094,8 +1941,8 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-LABEL: global_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -2113,8 +1960,8 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX6-LABEL: global_agent_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2133,9 +1980,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2149,9 +1995,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2165,9 +2010,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2180,8 +2024,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2200,9 +2044,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2214,9 +2057,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2228,9 +2070,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2243,9 +2084,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2258,8 +2098,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_agent_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2273,8 +2113,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_agent_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2288,8 +2128,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2304,8 +2144,8 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -2325,8 +2165,8 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX6-LABEL: global_agent_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2345,9 +2185,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2361,9 +2200,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2377,9 +2215,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2392,8 +2229,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2412,9 +2249,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2426,9 +2262,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2440,9 +2275,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2455,9 +2289,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2470,8 +2303,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_agent_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2485,8 +2318,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_agent_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2500,8 +2333,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2516,8 +2349,8 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_agent_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -2537,8 +2370,8 @@ entry: define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX6-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2557,8 +2390,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2574,9 +2407,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2589,9 +2421,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2603,8 +2434,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2623,9 +2454,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2637,9 +2467,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2651,9 +2480,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2665,9 +2493,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2679,8 +2506,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2693,8 +2520,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2707,8 +2534,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -2720,8 +2547,8 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -2739,8 +2566,8 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2760,8 +2587,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2778,9 +2605,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2795,9 +2621,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2811,8 +2636,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2832,9 +2657,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2847,9 +2671,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2862,9 +2685,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2878,9 +2700,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -2894,8 +2715,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2910,8 +2731,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2926,8 +2747,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2945,8 +2766,8 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -2970,8 +2791,8 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2991,8 +2812,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3009,9 +2830,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3026,9 +2846,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3042,8 +2861,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -3063,9 +2882,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3078,9 +2896,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3093,9 +2910,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -3109,9 +2925,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -3125,8 +2940,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3141,8 +2956,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3157,8 +2972,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -3176,8 +2991,8 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -3201,6 +3016,7 @@ entry: define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3222,7 +3038,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3247,7 +3063,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3262,7 +3077,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3277,6 +3091,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3298,7 +3113,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3313,7 +3127,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3328,7 +3141,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3343,7 +3155,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3358,7 +3169,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3373,7 +3183,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3388,8 +3197,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3404,8 +3211,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3427,6 +3232,7 @@ entry: define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3450,7 +3256,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3477,7 +3283,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3495,7 +3300,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3513,6 +3317,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3535,7 +3340,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3552,7 +3356,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3569,7 +3372,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3586,7 +3388,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3603,7 +3404,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3621,7 +3421,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3639,8 +3438,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3657,8 +3454,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3682,6 +3477,7 @@ entry: define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3704,7 +3500,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3730,7 +3526,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3747,7 +3542,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3764,6 +3558,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3786,7 +3581,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3802,7 +3596,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3818,7 +3611,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3835,7 +3627,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3852,7 +3643,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3869,7 +3659,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3886,8 +3675,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3906,8 +3693,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3933,6 +3718,7 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3957,7 +3743,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3985,7 +3771,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4005,7 +3790,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4025,6 +3809,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4048,7 +3833,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4066,7 +3850,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4084,7 +3867,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4103,7 +3885,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4122,7 +3903,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4142,7 +3922,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4162,8 +3941,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4184,8 +3961,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4213,6 +3988,7 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4237,7 +4013,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4265,7 +4041,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4285,7 +4060,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4305,6 +4079,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4328,7 +4103,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4346,7 +4120,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4364,7 +4137,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4383,7 +4155,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4402,7 +4173,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4422,7 +4192,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4442,8 +4211,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4464,8 +4231,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4493,6 +4258,7 @@ entry: define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4516,7 +4282,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4543,7 +4309,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4561,7 +4326,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4579,6 +4343,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4601,7 +4366,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4618,7 +4382,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4635,7 +4398,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4652,7 +4414,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4669,7 +4430,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4687,7 +4447,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4705,8 +4464,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4723,8 +4480,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4748,6 +4503,7 @@ entry: define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4771,7 +4527,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4798,7 +4554,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4816,7 +4571,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4834,6 +4588,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4856,7 +4611,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4873,7 +4627,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4890,7 +4643,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4907,7 +4659,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4924,7 +4675,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4942,7 +4692,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4960,8 +4709,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4978,8 +4725,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5003,6 +4748,7 @@ entry: define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX6-LABEL: global_agent_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5027,7 +4773,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5055,7 +4801,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5075,7 +4820,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5095,6 +4839,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5118,7 +4863,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5136,7 +4880,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5154,7 +4897,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5173,7 +4915,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5192,7 +4933,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5212,7 +4952,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5232,8 +4971,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5254,8 +4991,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5283,6 +5018,7 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5307,7 +5043,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5335,7 +5071,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5355,7 +5090,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5375,6 +5109,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5398,7 +5133,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5416,7 +5150,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5434,7 +5167,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5453,7 +5185,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5472,7 +5203,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5492,7 +5222,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5512,8 +5241,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5534,8 +5261,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5563,6 +5288,7 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5587,7 +5313,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5615,7 +5341,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5635,7 +5360,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5655,6 +5379,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5678,7 +5403,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5696,7 +5420,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5714,7 +5437,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5733,7 +5455,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5752,7 +5473,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5772,7 +5492,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5792,8 +5511,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5814,8 +5531,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5843,6 +5558,7 @@ entry: define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5867,7 +5583,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5895,7 +5611,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5915,7 +5630,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5935,6 +5649,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5958,7 +5673,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5976,7 +5690,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5994,7 +5707,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6013,7 +5725,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6032,7 +5743,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6052,7 +5762,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6072,8 +5781,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6094,8 +5801,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6123,6 +5828,7 @@ entry: define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6147,7 +5853,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6175,7 +5881,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6195,7 +5900,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6215,6 +5919,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6238,7 +5943,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6256,7 +5960,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6274,7 +5977,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6293,7 +5995,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6312,7 +6013,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6332,7 +6032,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6352,8 +6051,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6374,8 +6071,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6403,6 +6098,7 @@ entry: define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6427,7 +6123,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6455,7 +6151,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6475,7 +6170,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6495,6 +6189,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6518,7 +6213,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6536,7 +6230,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6554,7 +6247,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6573,7 +6265,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6592,7 +6283,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6612,7 +6302,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6632,8 +6321,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6654,8 +6341,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6683,6 +6368,7 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6707,7 +6393,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6735,7 +6421,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6755,7 +6440,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6775,6 +6459,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6798,7 +6483,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6816,7 +6500,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6834,7 +6517,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6853,7 +6535,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6872,7 +6553,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6892,7 +6572,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6912,8 +6591,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6934,8 +6611,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6963,6 +6638,7 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6987,7 +6663,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -7015,7 +6691,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7035,7 +6710,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7055,6 +6729,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7078,7 +6753,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7096,7 +6770,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7114,7 +6787,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7133,7 +6805,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7152,7 +6823,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7172,7 +6842,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7192,8 +6861,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7214,8 +6881,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7243,6 +6908,7 @@ entry: define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7267,6 +6933,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7295,7 +6962,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7312,7 +6978,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7329,6 +6994,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7353,7 +7019,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7370,7 +7035,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7387,7 +7051,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7404,7 +7067,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7421,7 +7083,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7438,7 +7099,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7455,8 +7115,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7473,8 +7131,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7500,6 +7156,7 @@ entry: define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7525,6 +7182,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7554,7 +7212,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7573,7 +7230,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7592,6 +7248,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7617,7 +7274,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7635,7 +7291,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7653,7 +7308,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7671,7 +7325,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7689,7 +7342,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7708,7 +7360,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7727,8 +7378,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7746,8 +7395,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7774,6 +7421,7 @@ entry: define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7799,6 +7447,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7828,7 +7477,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7847,7 +7495,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7866,6 +7513,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7891,7 +7539,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7909,7 +7556,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7927,7 +7573,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7946,7 +7591,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7965,7 +7609,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7984,7 +7627,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8003,8 +7645,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8025,8 +7665,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8056,6 +7694,7 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8082,6 +7721,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8112,7 +7752,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8133,7 +7772,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8154,6 +7792,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8180,7 +7819,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8199,7 +7837,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8218,7 +7855,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8238,7 +7874,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8258,7 +7893,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8279,7 +7913,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8300,8 +7933,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8325,8 +7956,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8359,6 +7988,7 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8385,6 +8015,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8415,7 +8046,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8436,7 +8066,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8457,6 +8086,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8483,7 +8113,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8502,7 +8131,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8521,7 +8149,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8541,7 +8168,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8561,7 +8187,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8582,7 +8207,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8603,8 +8227,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8628,8 +8250,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8662,6 +8282,7 @@ entry: define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8687,6 +8308,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8716,7 +8338,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8735,7 +8356,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8754,6 +8374,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8779,7 +8400,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8797,7 +8417,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8815,7 +8434,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8833,7 +8451,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8851,7 +8468,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8870,7 +8486,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8889,8 +8504,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8910,8 +8523,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8940,6 +8551,7 @@ entry: define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8965,6 +8577,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8994,7 +8607,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9013,7 +8625,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9032,6 +8643,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9057,7 +8669,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9075,7 +8686,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9093,7 +8703,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9111,7 +8720,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9129,7 +8737,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9148,7 +8755,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9167,8 +8773,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9186,8 +8790,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9214,6 +8816,7 @@ entry: define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9240,6 +8843,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9270,7 +8874,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9291,7 +8894,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9312,6 +8914,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9338,7 +8941,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9357,7 +8959,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9376,7 +8977,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9396,7 +8996,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9416,7 +9015,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9437,7 +9035,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9458,8 +9055,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9483,8 +9078,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9517,6 +9110,7 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9543,6 +9137,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9573,7 +9168,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9594,7 +9188,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9615,6 +9208,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9641,7 +9235,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9660,7 +9253,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9679,7 +9271,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9699,7 +9290,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9719,7 +9309,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9740,7 +9329,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9761,8 +9349,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9786,8 +9372,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9820,6 +9404,7 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9846,6 +9431,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9876,7 +9462,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9897,7 +9482,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9918,6 +9502,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9944,7 +9529,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9963,7 +9547,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9982,7 +9565,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10002,7 +9584,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10022,7 +9603,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10043,7 +9623,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10064,8 +9643,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10089,8 +9666,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10123,6 +9698,7 @@ entry: define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10149,6 +9725,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10179,7 +9756,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10200,7 +9776,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10221,6 +9796,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10247,7 +9823,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10266,7 +9841,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10285,7 +9859,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10305,7 +9878,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10325,7 +9897,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10346,7 +9917,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10367,8 +9937,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10392,8 +9960,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10426,6 +9992,7 @@ entry: define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10452,6 +10019,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10482,7 +10050,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10503,7 +10070,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10524,6 +10090,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10550,7 +10117,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10569,7 +10135,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10588,7 +10153,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10608,7 +10172,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10628,7 +10191,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10649,7 +10211,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10670,8 +10231,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10693,8 +10252,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10725,6 +10282,7 @@ entry: define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10751,6 +10309,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10781,7 +10340,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10802,7 +10360,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10823,6 +10380,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10849,7 +10407,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10868,7 +10425,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10887,7 +10443,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10907,7 +10462,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10927,7 +10481,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10948,7 +10501,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10969,8 +10521,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10994,8 +10544,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11028,6 +10576,7 @@ entry: define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11054,6 +10603,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11084,7 +10634,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11105,7 +10654,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11126,6 +10674,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -11152,7 +10701,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11171,7 +10719,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11190,7 +10737,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -11210,7 +10756,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -11230,7 +10775,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11251,7 +10795,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11272,8 +10815,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11297,8 +10838,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11331,6 +10870,7 @@ entry: define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11357,6 +10897,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -11387,7 +10928,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11408,7 +10948,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11429,6 +10968,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -11455,7 +10995,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11474,7 +11013,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -11493,7 +11031,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -11513,7 +11050,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -11533,7 +11069,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11554,7 +11089,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11575,8 +11109,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11600,8 +11132,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11634,8 +11164,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX6-LABEL: global_agent_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -11658,9 +11189,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX7-LABEL: global_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11673,11 +11203,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11686,11 +11214,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX10-CU-LABEL: global_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -11699,8 +11225,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11723,11 +11250,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11736,11 +11261,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11749,11 +11272,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11762,11 +11283,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11775,10 +11294,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX11-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11787,10 +11305,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX11-CU-LABEL: global_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -11799,11 +11316,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX12-WGP-LABEL: global_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11812,11 +11327,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX12-CU-LABEL: global_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11832,8 +11345,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX6-LABEL: global_agent_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -11856,9 +11370,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX7-LABEL: global_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11871,11 +11384,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11884,11 +11395,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -11897,8 +11406,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11921,11 +11431,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11934,11 +11442,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11947,11 +11453,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11960,11 +11464,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11973,10 +11475,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11985,10 +11486,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -11997,11 +11497,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12010,11 +11508,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12030,8 +11526,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX6-LABEL: global_agent_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -12055,9 +11552,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX7-LABEL: global_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12071,11 +11567,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12086,11 +11580,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -12101,8 +11593,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12125,11 +11618,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12139,11 +11630,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12153,11 +11642,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12167,11 +11654,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12181,10 +11666,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12195,10 +11679,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -12209,11 +11692,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12223,11 +11704,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12244,8 +11723,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX6-LABEL: global_agent_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -12270,9 +11750,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12287,11 +11766,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -12303,11 +11780,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -12319,8 +11794,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12344,11 +11820,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12358,11 +11832,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12372,11 +11844,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12386,11 +11856,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12400,10 +11868,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -12415,10 +11882,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -12430,11 +11896,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -12450,11 +11914,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12477,8 +11939,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX6-LABEL: global_agent_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12494,8 +11957,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX7-LABEL: global_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12505,10 +11968,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -12517,10 +11978,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX10-CU-LABEL: global_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -12529,8 +11988,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12546,10 +12006,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12558,10 +12016,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12570,10 +12026,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12582,10 +12036,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12594,9 +12046,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX11-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12605,9 +12056,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX11-CU-LABEL: global_agent_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12616,10 +12066,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX12-WGP-LABEL: global_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12628,10 +12076,8 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX12-CU-LABEL: global_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12646,8 +12092,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX6-LABEL: global_agent_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12663,8 +12110,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX7-LABEL: global_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12674,10 +12121,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -12686,10 +12131,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -12698,8 +12141,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12715,10 +12159,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12727,10 +12169,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12739,10 +12179,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12751,10 +12189,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12763,9 +12199,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12774,9 +12209,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12785,10 +12219,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12797,10 +12229,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12815,8 +12245,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX6-LABEL: global_agent_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12833,8 +12264,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX7-LABEL: global_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12845,10 +12276,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -12859,10 +12288,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX10-CU-LABEL: global_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -12873,8 +12300,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12891,10 +12319,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12904,10 +12330,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12917,10 +12341,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12931,10 +12353,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12945,9 +12365,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12958,9 +12377,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX11-CU-LABEL: global_agent_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12971,10 +12389,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12987,10 +12403,8 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX12-CU-LABEL: global_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -13009,8 +12423,9 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX6-LABEL: global_agent_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13027,8 +12442,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13039,10 +12454,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -13053,10 +12466,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -13067,8 +12478,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13085,10 +12497,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -13098,10 +12508,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -13111,10 +12519,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -13125,10 +12531,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -13139,9 +12543,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -13152,9 +12555,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -13165,10 +12567,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -13181,10 +12581,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -13203,8 +12601,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13220,9 +12618,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13233,9 +12630,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13244,9 +12640,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13254,8 +12649,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13272,9 +12667,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13283,9 +12677,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13294,9 +12687,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -13305,9 +12697,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -13316,8 +12707,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13326,8 +12717,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13336,8 +12727,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -13346,8 +12737,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -13361,8 +12752,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13380,9 +12771,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13395,9 +12785,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13409,9 +12798,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13422,8 +12810,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13441,9 +12829,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13454,9 +12841,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13467,9 +12853,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -13480,9 +12865,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -13493,8 +12877,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13506,8 +12890,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13519,8 +12903,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -13531,8 +12915,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV @@ -13548,8 +12932,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX6-LABEL: global_agent_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13566,9 +12950,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13580,9 +12963,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13593,9 +12975,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -13605,8 +12986,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13624,9 +13005,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13636,9 +13016,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13648,9 +13027,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -13661,9 +13039,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -13674,8 +13051,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13686,8 +13063,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -13698,8 +13075,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13712,8 +13089,8 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -13731,8 +13108,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13751,9 +13128,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13767,9 +13143,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13783,9 +13158,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -13798,8 +13172,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13818,9 +13192,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13832,9 +13205,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13846,9 +13218,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -13861,9 +13232,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -13876,8 +13246,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13891,8 +13261,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -13906,8 +13276,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13922,8 +13292,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -13943,8 +13313,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13963,9 +13333,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13979,9 +13348,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13995,9 +13363,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -14010,8 +13377,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -14030,9 +13397,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14044,9 +13410,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14058,9 +13423,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -14073,9 +13437,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -14088,8 +13451,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -14103,8 +13466,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -14118,8 +13481,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -14134,8 +13497,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -14155,8 +13518,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -14175,8 +13538,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14192,9 +13555,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -14207,9 +13569,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -14221,8 +13582,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -14241,9 +13602,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -14255,9 +13615,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -14269,9 +13628,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -14283,9 +13641,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -14297,8 +13654,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -14311,8 +13668,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -14325,8 +13682,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -14338,8 +13695,8 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -14357,8 +13714,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -14378,8 +13735,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14396,9 +13753,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -14413,9 +13769,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -14429,8 +13784,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -14450,9 +13805,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14465,9 +13819,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14480,9 +13833,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -14496,9 +13848,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -14512,8 +13863,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -14528,8 +13879,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -14544,8 +13895,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -14563,8 +13914,8 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -14588,8 +13939,8 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -14609,8 +13960,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -14627,9 +13978,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -14644,9 +13994,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -14660,8 +14009,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -14681,9 +14030,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14696,9 +14044,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -14711,9 +14058,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 @@ -14727,9 +14073,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 @@ -14743,8 +14088,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -14759,8 +14104,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -14775,8 +14120,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -14794,8 +14139,8 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -14819,6 +14164,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14840,7 +14186,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14865,7 +14211,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14880,7 +14225,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14895,6 +14239,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14916,7 +14261,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14931,7 +14275,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14946,7 +14289,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14961,7 +14303,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14976,7 +14317,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14991,7 +14331,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15006,8 +14345,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15022,8 +14359,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15045,6 +14380,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15068,7 +14404,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15095,7 +14431,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15113,7 +14448,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15131,6 +14465,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15153,7 +14488,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15170,7 +14504,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15187,7 +14520,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15204,7 +14536,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15221,7 +14552,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15239,7 +14569,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15257,8 +14586,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15275,8 +14602,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15300,6 +14625,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15322,7 +14648,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15348,7 +14674,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15365,7 +14690,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15382,6 +14706,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15404,7 +14729,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15420,7 +14744,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15436,7 +14759,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15453,7 +14775,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15470,7 +14791,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15487,7 +14807,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15504,8 +14823,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15524,8 +14841,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15551,6 +14866,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15575,7 +14891,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15603,7 +14919,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15623,7 +14938,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15643,6 +14957,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15666,7 +14981,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15684,7 +14998,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15702,7 +15015,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15721,7 +15033,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15740,7 +15051,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15760,7 +15070,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15780,8 +15089,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15802,8 +15109,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15831,6 +15136,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15855,7 +15161,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15883,7 +15189,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15903,7 +15208,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15923,6 +15227,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15946,7 +15251,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15964,7 +15268,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15982,7 +15285,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16001,7 +15303,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16020,7 +15321,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16040,7 +15340,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16060,8 +15359,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16082,8 +15379,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16111,6 +15406,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16134,7 +15430,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16161,7 +15457,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16179,7 +15474,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16197,6 +15491,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16219,7 +15514,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16236,7 +15530,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16253,7 +15546,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16270,7 +15562,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16287,7 +15578,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16305,7 +15595,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16323,8 +15612,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16341,8 +15628,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16366,6 +15651,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16389,7 +15675,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16416,7 +15702,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16434,7 +15719,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16452,6 +15736,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16474,7 +15759,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16491,7 +15775,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16508,7 +15791,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16525,7 +15807,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16542,7 +15823,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16560,7 +15840,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16578,8 +15857,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16596,8 +15873,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16621,6 +15896,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16645,7 +15921,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16673,7 +15949,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16693,7 +15968,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16713,6 +15987,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16736,7 +16011,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16754,7 +16028,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16772,7 +16045,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16791,7 +16063,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16810,7 +16081,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16830,7 +16100,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16850,8 +16119,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16872,8 +16139,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16901,6 +16166,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16925,7 +16191,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16953,7 +16219,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16973,7 +16238,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16993,6 +16257,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17016,7 +16281,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17034,7 +16298,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17052,7 +16315,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17071,7 +16333,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17090,7 +16351,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17110,7 +16370,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17130,8 +16389,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17152,8 +16409,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17181,6 +16436,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17205,7 +16461,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17233,7 +16489,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17253,7 +16508,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17273,6 +16527,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17296,7 +16551,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17314,7 +16568,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17332,7 +16585,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17351,7 +16603,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17370,7 +16621,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17390,7 +16640,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17410,8 +16659,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17432,8 +16679,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17461,6 +16706,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17485,7 +16731,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17513,7 +16759,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17533,7 +16778,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17553,6 +16797,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17576,7 +16821,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17594,7 +16838,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17612,7 +16855,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17631,7 +16873,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17650,7 +16891,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17670,7 +16910,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17690,8 +16929,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17712,8 +16949,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17741,6 +16976,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17765,7 +17001,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17793,7 +17029,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17813,7 +17048,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17833,6 +17067,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17856,7 +17091,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17874,7 +17108,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17892,7 +17125,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17911,7 +17143,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17930,7 +17161,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17950,7 +17180,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17970,8 +17199,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17992,8 +17219,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18021,6 +17246,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18045,7 +17271,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18073,7 +17299,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18093,7 +17318,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18113,6 +17337,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18136,7 +17361,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18154,7 +17378,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18172,7 +17395,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18191,7 +17413,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18210,7 +17431,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18230,7 +17450,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18250,8 +17469,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18272,8 +17489,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18301,6 +17516,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18325,7 +17541,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18353,7 +17569,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18373,7 +17588,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18393,6 +17607,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18416,7 +17631,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18434,7 +17648,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18452,7 +17665,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18471,7 +17683,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18490,7 +17701,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18510,7 +17720,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18530,8 +17739,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18552,8 +17759,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18581,6 +17786,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18605,7 +17811,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -18633,7 +17839,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18653,7 +17858,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18673,6 +17877,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18696,7 +17901,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18714,7 +17918,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18732,7 +17935,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18751,7 +17953,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18770,7 +17971,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18790,7 +17990,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18810,8 +18009,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18832,8 +18029,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18861,6 +18056,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18885,6 +18081,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18913,7 +18110,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18930,7 +18126,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18947,6 +18142,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18971,7 +18167,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18988,7 +18183,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19005,7 +18199,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19022,7 +18215,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19039,7 +18231,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19056,7 +18247,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19073,8 +18263,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19091,8 +18279,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19118,6 +18304,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19143,6 +18330,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19172,7 +18360,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19191,7 +18378,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19210,6 +18396,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19235,7 +18422,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19253,7 +18439,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19271,7 +18456,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19289,7 +18473,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19307,7 +18490,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19326,7 +18508,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19345,8 +18526,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19364,8 +18543,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19392,6 +18569,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19418,6 +18596,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19448,7 +18627,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19469,7 +18647,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19490,6 +18667,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19516,7 +18694,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19535,7 +18712,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19554,7 +18730,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19574,7 +18749,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19594,7 +18768,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19615,7 +18788,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19636,8 +18808,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19661,8 +18831,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19695,6 +18863,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19721,6 +18890,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19751,7 +18921,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19772,7 +18941,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19793,6 +18961,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19819,7 +18988,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19838,7 +19006,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19857,7 +19024,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19877,7 +19043,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19897,7 +19062,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19918,7 +19082,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19939,8 +19102,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19964,8 +19125,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19998,6 +19157,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20023,6 +19183,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20052,7 +19213,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20071,7 +19231,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20090,6 +19249,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20115,7 +19275,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20133,7 +19292,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20151,7 +19309,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20169,7 +19326,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20187,7 +19343,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20206,7 +19361,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20225,8 +19379,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20246,8 +19398,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20276,6 +19426,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20301,6 +19452,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20330,7 +19482,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20349,7 +19500,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20368,6 +19518,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20393,7 +19544,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20411,7 +19561,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20429,7 +19578,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20447,7 +19595,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20465,7 +19612,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20484,7 +19630,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20503,8 +19648,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20522,8 +19665,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20550,6 +19691,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20576,6 +19718,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20606,7 +19749,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20627,7 +19769,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20648,6 +19789,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20674,7 +19816,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20693,7 +19834,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20712,7 +19852,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20732,7 +19871,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20752,7 +19890,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20773,7 +19910,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20794,8 +19930,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20819,8 +19953,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20853,6 +19985,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20879,6 +20012,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20909,7 +20043,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20930,7 +20063,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20951,6 +20083,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20977,7 +20110,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20996,7 +20128,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21015,7 +20146,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21035,7 +20165,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21055,7 +20184,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21076,7 +20204,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21097,8 +20224,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21122,8 +20247,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21156,6 +20279,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21182,6 +20306,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21212,7 +20337,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21233,7 +20357,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21254,6 +20377,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21280,7 +20404,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21299,7 +20422,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21318,7 +20440,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21338,7 +20459,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21358,7 +20478,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21379,7 +20498,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21400,8 +20518,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21425,8 +20541,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21459,6 +20573,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21485,6 +20600,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21515,7 +20631,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21536,7 +20651,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21557,6 +20671,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21583,7 +20698,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21602,7 +20716,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21621,7 +20734,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21641,7 +20753,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21661,7 +20772,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21682,7 +20792,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21703,8 +20812,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21728,8 +20835,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21762,6 +20867,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21788,6 +20894,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21818,7 +20925,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21839,7 +20945,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21860,6 +20965,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21886,7 +20992,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21905,7 +21010,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21924,7 +21028,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21944,7 +21047,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21964,7 +21066,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21985,7 +21086,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22006,8 +21106,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22029,8 +21127,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22061,6 +21157,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22087,6 +21184,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22117,7 +21215,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22138,7 +21235,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22159,6 +21255,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22185,7 +21282,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22204,7 +21300,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22223,7 +21318,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22243,7 +21337,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22263,7 +21356,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22284,7 +21376,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22305,8 +21396,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22330,8 +21419,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22364,6 +21451,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22390,6 +21478,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22420,7 +21509,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22441,7 +21529,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22462,6 +21549,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22488,7 +21576,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22507,7 +21594,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22526,7 +21612,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22546,7 +21631,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22566,7 +21650,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22587,7 +21670,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22608,8 +21690,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22633,8 +21713,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22667,6 +21745,7 @@ entry: define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22693,6 +21772,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -22723,7 +21803,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22744,7 +21823,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22765,6 +21843,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22791,7 +21870,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22810,7 +21888,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22829,7 +21906,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22849,7 +21925,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22869,7 +21944,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22890,7 +21964,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22911,8 +21984,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22936,8 +22007,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 0fc3212b0f46d9..465626078f6c68 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -5,10 +5,8 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_0: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -25,11 +23,9 @@ entry: define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff ; GFX12-NEXT: s_wait_alu 0xfffe @@ -53,11 +49,9 @@ entry: define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX12-NEXT: s_wait_bvhcnt 0x0 @@ -74,11 +68,9 @@ entry: define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s4, 0x3ff ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 14f1734235673a..f06118a7a6dc9b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -16,8 +16,9 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -35,8 +36,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX7-LABEL: global_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -48,10 +49,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX10-WGP-LABEL: global_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -62,10 +61,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX10-CU-LABEL: global_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -76,8 +73,9 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -95,10 +93,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -109,10 +105,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -123,10 +117,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -137,10 +129,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX940-TGSPLIT-LABEL: global_nontemporal_load_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -151,9 +141,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX11-WGP-LABEL: global_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -164,9 +153,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX11-CU-LABEL: global_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -177,10 +165,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX12-WGP-LABEL: global_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -191,10 +177,8 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX12-CU-LABEL: global_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -212,8 +196,9 @@ entry: define amdgpu_kernel void @global_nontemporal_load_1( ; GFX6-LABEL: global_nontemporal_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -242,8 +227,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX7-LABEL: global_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -270,11 +256,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX10-WGP-LABEL: global_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_mov_b32 s8, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s8, v1 @@ -286,11 +270,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX10-CU-LABEL: global_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_mov_b32 s8, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s8, v1 @@ -302,8 +284,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -332,11 +315,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s8, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s8 @@ -350,11 +331,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s8, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s8 @@ -368,11 +347,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0x3ff ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s4 @@ -386,11 +363,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX940-TGSPLIT-LABEL: global_nontemporal_load_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s4, 0x3ff ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s4 @@ -404,10 +379,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX11-WGP-LABEL: global_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s4, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s4 @@ -421,10 +395,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX11-CU-LABEL: global_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s4, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s4 @@ -438,11 +411,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX12-WGP-LABEL: global_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -458,11 +429,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX12-CU-LABEL: global_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -487,8 +456,9 @@ entry: define amdgpu_kernel void @global_nontemporal_store_0( ; GFX6-LABEL: global_nontemporal_store_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -506,8 +476,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX7-LABEL: global_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -519,10 +489,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX10-WGP-LABEL: global_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -533,10 +501,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX10-CU-LABEL: global_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -547,8 +513,9 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -566,10 +533,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -580,10 +545,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -594,10 +557,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -608,10 +569,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX940-TGSPLIT-LABEL: global_nontemporal_store_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -622,9 +581,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX11-WGP-LABEL: global_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -635,9 +593,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX11-CU-LABEL: global_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -648,10 +605,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX12-WGP-LABEL: global_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -662,10 +617,8 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX12-CU-LABEL: global_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -683,9 +636,8 @@ entry: define amdgpu_kernel void @global_nontemporal_store_1( ; GFX6-LABEL: global_nontemporal_store_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX6-NEXT: s_mov_b32 s6, 0x100f000 @@ -708,8 +660,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX7-LABEL: global_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s5, 2 @@ -735,10 +687,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX10-WGP-LABEL: global_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_mov_b32 s7, 2 @@ -750,10 +700,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX10-CU-LABEL: global_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-CU-NEXT: s_mov_b32 s7, 2 @@ -765,9 +713,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0xf000 @@ -790,10 +737,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff @@ -807,10 +752,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff @@ -824,10 +767,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff @@ -841,10 +782,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX940-TGSPLIT-LABEL: global_nontemporal_store_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff @@ -858,9 +797,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX11-WGP-LABEL: global_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff @@ -874,9 +812,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX11-CU-LABEL: global_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff @@ -890,10 +827,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX12-WGP-LABEL: global_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff @@ -909,10 +844,8 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX12-CU-LABEL: global_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff @@ -937,8 +870,9 @@ entry: define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX6-LABEL: global_nontemporal_volatile_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -961,9 +895,8 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX7-LABEL: global_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -976,11 +909,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: global_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -989,11 +920,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: global_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -1002,8 +931,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1026,11 +956,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1039,11 +967,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1052,11 +978,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1065,11 +989,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -1078,10 +1000,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX11-WGP-LABEL: global_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -1090,10 +1011,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX11-CU-LABEL: global_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -1102,11 +1022,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX12-WGP-LABEL: global_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -1117,11 +1035,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX12-CU-LABEL: global_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 33aaeebf658dd6..ad08dc1777f64f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -16,8 +16,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -40,9 +41,8 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX7-LABEL: global_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -55,11 +55,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: global_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -68,11 +66,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX10-CU-LABEL: global_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -81,8 +77,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -105,11 +102,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -118,11 +113,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -131,11 +124,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -144,11 +135,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -157,10 +146,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX11-WGP-LABEL: global_singlethread_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -169,10 +157,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX11-CU-LABEL: global_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -181,11 +168,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX12-WGP-LABEL: global_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -194,11 +179,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX12-CU-LABEL: global_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -214,8 +197,9 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX6-LABEL: global_singlethread_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -238,9 +222,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX7-LABEL: global_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -253,11 +236,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -266,11 +247,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -279,8 +258,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -303,11 +283,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -316,11 +294,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -329,11 +305,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -342,11 +316,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -355,10 +327,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -367,10 +338,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -379,11 +349,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -392,11 +360,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -412,8 +378,9 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX6-LABEL: global_singlethread_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -436,9 +403,8 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX7-LABEL: global_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -451,11 +417,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -464,11 +428,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX10-CU-LABEL: global_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -477,8 +439,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -501,11 +464,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -514,11 +475,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -527,11 +486,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -540,11 +497,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -553,10 +508,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -565,10 +519,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX11-CU-LABEL: global_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -577,11 +530,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -590,11 +541,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX12-CU-LABEL: global_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -610,8 +559,9 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX6-LABEL: global_singlethread_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -634,9 +584,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -649,11 +598,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -662,11 +609,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -675,8 +620,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -699,11 +645,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -712,11 +656,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -725,11 +667,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -738,11 +678,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -751,10 +689,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -763,10 +700,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -775,11 +711,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -788,11 +722,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -808,8 +740,9 @@ entry: define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX6-LABEL: global_singlethread_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -825,8 +758,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX7-LABEL: global_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -836,10 +769,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: global_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -848,10 +779,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX10-CU-LABEL: global_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -860,8 +789,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -877,10 +807,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -889,10 +817,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -901,10 +827,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -913,10 +837,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -925,9 +847,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX11-WGP-LABEL: global_singlethread_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -936,9 +857,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX11-CU-LABEL: global_singlethread_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -947,10 +867,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX12-WGP-LABEL: global_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -959,10 +877,8 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX12-CU-LABEL: global_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -977,8 +893,9 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX6-LABEL: global_singlethread_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -994,8 +911,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX7-LABEL: global_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1005,10 +922,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1017,10 +932,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1029,8 +942,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1046,10 +960,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1058,10 +970,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1070,10 +980,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1082,10 +990,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1094,9 +1000,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1105,9 +1010,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1116,10 +1020,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1128,10 +1030,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1146,8 +1046,9 @@ entry: define amdgpu_kernel void @global_singlethread_release_store( ; GFX6-LABEL: global_singlethread_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1163,8 +1064,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX7-LABEL: global_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1174,10 +1075,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX10-WGP-LABEL: global_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1186,10 +1085,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX10-CU-LABEL: global_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1198,8 +1095,9 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1215,10 +1113,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1227,10 +1123,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1239,10 +1133,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1251,10 +1143,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1263,9 +1153,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX11-WGP-LABEL: global_singlethread_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1274,9 +1163,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX11-CU-LABEL: global_singlethread_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1285,10 +1173,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX12-WGP-LABEL: global_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1297,10 +1183,8 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX12-CU-LABEL: global_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1315,8 +1199,9 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX6-LABEL: global_singlethread_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1332,8 +1217,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1343,10 +1228,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1355,10 +1238,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1367,8 +1248,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1384,10 +1266,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1396,10 +1276,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1408,10 +1286,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1420,10 +1296,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1432,9 +1306,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1443,9 +1316,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1454,10 +1326,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1466,10 +1336,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1484,8 +1352,8 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX6-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1501,9 +1369,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1514,9 +1381,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1525,9 +1391,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1535,8 +1400,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1553,9 +1418,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1564,9 +1428,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1575,9 +1438,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1586,9 +1448,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1597,8 +1458,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1607,8 +1468,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1617,8 +1478,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1627,8 +1488,8 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1642,8 +1503,8 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX6-LABEL: global_singlethread_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1659,9 +1520,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1672,9 +1532,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1683,9 +1542,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1693,8 +1551,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1711,9 +1569,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1722,9 +1579,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1733,9 +1589,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1744,9 +1599,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1755,8 +1609,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1765,8 +1619,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1775,8 +1629,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1785,8 +1639,8 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1800,8 +1654,8 @@ entry: define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX6-LABEL: global_singlethread_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1817,9 +1671,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1830,9 +1683,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1841,9 +1693,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1851,8 +1702,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1869,9 +1720,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1880,9 +1730,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1891,9 +1740,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1902,9 +1750,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1913,8 +1760,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1923,8 +1770,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1933,8 +1780,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1943,8 +1790,8 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1958,8 +1805,8 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX6-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1975,9 +1822,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1988,9 +1834,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1999,9 +1844,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2009,8 +1853,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2027,9 +1871,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2038,9 +1881,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2049,9 +1891,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2060,9 +1901,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2071,8 +1911,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2081,8 +1921,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2091,8 +1931,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2101,8 +1941,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2116,8 +1956,8 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX6-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2133,9 +1973,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2146,9 +1985,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2157,9 +1995,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2167,8 +2004,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2185,9 +2022,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2196,9 +2032,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2207,9 +2042,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2218,9 +2052,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2229,8 +2062,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2239,8 +2072,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2249,8 +2082,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2259,8 +2092,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2274,8 +2107,8 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2293,8 +2126,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2309,9 +2142,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2322,9 +2154,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2334,8 +2165,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2354,9 +2185,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2367,9 +2197,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2380,9 +2209,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2393,9 +2221,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2406,8 +2233,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2418,8 +2245,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2430,8 +2257,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2442,8 +2269,8 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2460,8 +2287,8 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2479,8 +2306,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2495,9 +2322,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2508,9 +2334,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2520,8 +2345,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2540,9 +2365,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2553,9 +2377,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2566,9 +2389,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2579,9 +2401,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2592,8 +2413,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2604,8 +2425,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2616,8 +2437,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2628,8 +2449,8 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2646,8 +2467,8 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2665,8 +2486,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2681,9 +2502,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2694,9 +2514,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2706,8 +2525,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2726,9 +2545,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2739,9 +2557,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2752,9 +2569,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2765,9 +2581,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2778,8 +2593,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2790,8 +2605,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2802,8 +2617,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2814,8 +2629,8 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2832,6 +2647,7 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -2853,7 +2669,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -2878,7 +2694,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2893,7 +2708,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2908,6 +2722,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -2929,7 +2744,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2944,7 +2758,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2959,7 +2772,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -2974,7 +2786,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -2989,7 +2800,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3004,7 +2814,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3019,8 +2828,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3035,8 +2842,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3058,6 +2863,7 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3079,7 +2885,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3104,7 +2910,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3119,7 +2924,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3134,6 +2938,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3155,7 +2960,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3170,7 +2974,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3185,7 +2988,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3200,7 +3002,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3215,7 +3016,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3230,7 +3030,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3245,8 +3044,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3261,8 +3058,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3284,6 +3079,7 @@ entry: define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3305,7 +3101,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3330,7 +3126,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3345,7 +3140,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3360,6 +3154,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3381,7 +3176,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3396,7 +3190,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3411,7 +3204,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3426,7 +3218,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3441,7 +3232,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3456,7 +3246,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3471,8 +3260,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3487,8 +3274,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3510,6 +3295,7 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3531,7 +3317,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3556,7 +3342,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3571,7 +3356,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3586,6 +3370,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3607,7 +3392,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3622,7 +3406,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3637,7 +3420,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3652,7 +3434,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3667,7 +3448,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3682,7 +3462,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3697,8 +3476,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3713,8 +3490,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3736,6 +3511,7 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3757,7 +3533,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3782,7 +3558,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3797,7 +3572,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3812,6 +3586,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3833,7 +3608,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3848,7 +3622,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3863,7 +3636,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3878,7 +3650,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3893,7 +3664,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3908,7 +3678,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3923,8 +3692,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3939,8 +3706,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3962,6 +3727,7 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3983,7 +3749,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4008,7 +3774,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4023,7 +3788,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4038,6 +3802,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4059,7 +3824,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4074,7 +3838,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4089,7 +3852,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4104,7 +3866,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4119,7 +3880,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4134,7 +3894,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4149,8 +3908,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4165,8 +3922,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4188,6 +3943,7 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4209,7 +3965,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4234,7 +3990,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4249,7 +4004,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4264,6 +4018,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4285,7 +4040,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4300,7 +4054,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4315,7 +4068,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4330,7 +4082,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4345,7 +4096,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4360,7 +4110,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4375,8 +4124,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4391,8 +4138,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4414,6 +4159,7 @@ entry: define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4435,7 +4181,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4460,7 +4206,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4475,7 +4220,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4490,6 +4234,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4511,7 +4256,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4526,7 +4270,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4541,7 +4284,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4556,7 +4298,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4571,7 +4312,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4586,7 +4326,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4601,8 +4340,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4617,8 +4354,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4640,6 +4375,7 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4661,7 +4397,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4686,7 +4422,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4701,7 +4436,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4716,6 +4450,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4737,7 +4472,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4752,7 +4486,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4767,7 +4500,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4782,7 +4514,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4797,7 +4528,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4812,7 +4542,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4827,8 +4556,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4843,8 +4570,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4866,6 +4591,7 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4887,7 +4613,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4912,7 +4638,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4927,7 +4652,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4942,6 +4666,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4963,7 +4688,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4978,7 +4702,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4993,7 +4716,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5008,7 +4730,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5023,7 +4744,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5038,7 +4758,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5053,8 +4772,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5069,8 +4786,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5092,6 +4807,7 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5113,7 +4829,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5138,7 +4854,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5153,7 +4868,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5168,6 +4882,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5189,7 +4904,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5204,7 +4918,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5219,7 +4932,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5234,7 +4946,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5249,7 +4960,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5264,7 +4974,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5279,8 +4988,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5295,8 +5002,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5318,6 +5023,7 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5339,7 +5045,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5364,7 +5070,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5379,7 +5084,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5394,6 +5098,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5415,7 +5120,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5430,7 +5134,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5445,7 +5148,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5460,7 +5162,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5475,7 +5176,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5490,7 +5190,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5505,8 +5204,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5521,8 +5218,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5544,6 +5239,7 @@ entry: define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5565,7 +5261,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5590,7 +5286,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5605,7 +5300,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5620,6 +5314,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5641,7 +5336,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5656,7 +5350,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5671,7 +5364,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5686,7 +5378,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5701,7 +5392,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5716,7 +5406,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5731,8 +5420,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5747,8 +5434,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5770,6 +5455,7 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5791,7 +5477,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5816,7 +5502,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5831,7 +5516,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5846,6 +5530,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5867,7 +5552,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5882,7 +5566,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5897,7 +5580,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5912,7 +5594,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5927,7 +5608,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5942,7 +5622,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5957,8 +5636,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5973,8 +5650,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5996,6 +5671,7 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6017,7 +5693,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6042,7 +5718,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6057,7 +5732,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6072,6 +5746,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6093,7 +5768,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6108,7 +5782,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6123,7 +5796,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6138,7 +5810,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6153,7 +5824,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6168,7 +5838,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6183,8 +5852,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6199,8 +5866,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6222,6 +5887,7 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6246,6 +5912,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6274,7 +5941,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6291,7 +5957,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6308,6 +5973,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6332,7 +5998,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6349,7 +6014,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6366,7 +6030,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6383,7 +6046,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6400,7 +6062,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6417,7 +6078,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6434,8 +6094,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6452,8 +6110,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6479,6 +6135,7 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6503,6 +6160,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6531,7 +6189,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6548,7 +6205,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6565,6 +6221,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6589,7 +6246,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6606,7 +6262,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6623,7 +6278,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6640,7 +6294,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6657,7 +6310,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6674,7 +6326,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6691,8 +6342,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6709,8 +6358,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6736,6 +6383,7 @@ entry: define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6760,6 +6408,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6788,7 +6437,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6805,7 +6453,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6822,6 +6469,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6846,7 +6494,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6863,7 +6510,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6880,7 +6526,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6897,7 +6542,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6914,7 +6558,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6931,7 +6574,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6948,8 +6590,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6966,8 +6606,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6993,6 +6631,7 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7017,6 +6656,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7045,7 +6685,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7062,7 +6701,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7079,6 +6717,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7103,7 +6742,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7120,7 +6758,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7137,7 +6774,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7154,7 +6790,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7171,7 +6806,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7188,7 +6822,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7205,8 +6838,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7223,8 +6854,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7250,6 +6879,7 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7274,6 +6904,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7302,7 +6933,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7319,7 +6949,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7336,6 +6965,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7360,7 +6990,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7377,7 +7006,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7394,7 +7022,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7411,7 +7038,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7428,7 +7054,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7445,7 +7070,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7462,8 +7086,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7480,8 +7102,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7507,6 +7127,7 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7531,6 +7152,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7559,7 +7181,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7576,7 +7197,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7593,6 +7213,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7617,7 +7238,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7634,7 +7254,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7651,7 +7270,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7668,7 +7286,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7685,7 +7302,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7702,7 +7318,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7719,8 +7334,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7737,8 +7350,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7764,6 +7375,7 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7788,6 +7400,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7816,7 +7429,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7833,7 +7445,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7850,6 +7461,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7874,7 +7486,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7891,7 +7502,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7908,7 +7518,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7925,7 +7534,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7942,7 +7550,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7959,7 +7566,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7976,8 +7582,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7994,8 +7598,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8021,6 +7623,7 @@ entry: define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8045,6 +7648,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8073,7 +7677,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8090,7 +7693,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8107,6 +7709,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8131,7 +7734,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8148,7 +7750,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8165,7 +7766,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8182,7 +7782,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8199,7 +7798,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8216,7 +7814,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8233,8 +7830,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8251,8 +7846,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8278,6 +7871,7 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8302,6 +7896,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8330,7 +7925,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8347,7 +7941,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8364,6 +7957,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8388,7 +7982,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8405,7 +7998,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8422,7 +8014,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8439,7 +8030,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8456,7 +8046,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8473,7 +8062,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8490,8 +8078,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8508,8 +8094,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8535,6 +8119,7 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8559,6 +8144,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8587,7 +8173,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8604,7 +8189,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8621,6 +8205,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8645,7 +8230,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8662,7 +8246,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8679,7 +8262,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8696,7 +8278,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8713,7 +8294,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8730,7 +8310,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8747,8 +8326,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8765,8 +8342,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8792,6 +8367,7 @@ entry: define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8816,6 +8392,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8844,7 +8421,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8861,7 +8437,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8878,6 +8453,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8902,7 +8478,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8919,7 +8494,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8936,7 +8510,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8953,7 +8526,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8970,7 +8542,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8987,7 +8558,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9004,8 +8574,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9022,8 +8590,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9049,6 +8615,7 @@ entry: define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9073,6 +8640,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9101,7 +8669,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9118,7 +8685,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9135,6 +8701,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9159,7 +8726,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9176,7 +8742,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9193,7 +8758,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9210,7 +8774,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9227,7 +8790,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9244,7 +8806,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9261,8 +8822,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9279,8 +8838,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9306,6 +8863,7 @@ entry: define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9330,6 +8888,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9358,7 +8917,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9375,7 +8933,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9392,6 +8949,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9416,7 +8974,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9433,7 +8990,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9450,7 +9006,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9467,7 +9022,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9484,7 +9038,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9501,7 +9054,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9518,8 +9070,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9536,8 +9086,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9563,6 +9111,7 @@ entry: define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9587,6 +9136,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9615,7 +9165,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9632,7 +9181,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9649,6 +9197,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9673,7 +9222,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9690,7 +9238,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9707,7 +9254,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9724,7 +9270,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9741,7 +9286,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9758,7 +9302,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9775,8 +9318,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9793,8 +9334,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9820,6 +9359,7 @@ entry: define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9844,6 +9384,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9872,7 +9413,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9889,7 +9429,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9906,6 +9445,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9930,7 +9470,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9947,7 +9486,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9964,7 +9502,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9981,7 +9518,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9998,7 +9534,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10015,7 +9550,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10032,8 +9566,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10050,8 +9582,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10077,8 +9607,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX6-LABEL: global_singlethread_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10101,9 +9632,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10116,11 +9646,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10129,11 +9657,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX10-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10142,8 +9668,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10166,11 +9693,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10179,11 +9704,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10192,11 +9715,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10205,11 +9726,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10218,10 +9737,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10230,10 +9748,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10242,11 +9759,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10255,11 +9770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10275,8 +9788,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX6-LABEL: global_singlethread_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10299,9 +9813,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10314,11 +9827,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10327,11 +9838,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10340,8 +9849,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10364,11 +9874,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10377,11 +9885,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10390,11 +9896,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10403,11 +9907,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10416,10 +9918,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10428,10 +9929,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10440,11 +9940,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10453,11 +9951,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10473,8 +9969,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX6-LABEL: global_singlethread_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10497,9 +9994,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10512,11 +10008,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10525,11 +10019,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10538,8 +10030,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10562,11 +10055,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10575,11 +10066,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10588,11 +10077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10601,11 +10088,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10614,10 +10099,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10626,10 +10110,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10638,11 +10121,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10651,11 +10132,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10671,8 +10150,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10695,9 +10175,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10710,11 +10189,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10723,11 +10200,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10736,8 +10211,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10760,11 +10236,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10773,11 +10247,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10786,11 +10258,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10799,11 +10269,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10812,10 +10280,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10824,10 +10291,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10836,11 +10302,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10849,11 +10313,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10869,8 +10331,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX6-LABEL: global_singlethread_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10886,8 +10349,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10897,10 +10360,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -10909,10 +10370,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX10-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -10921,8 +10380,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -10938,10 +10398,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -10950,10 +10408,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -10962,10 +10418,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -10974,10 +10428,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -10986,9 +10438,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -10997,9 +10448,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11008,10 +10458,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11020,10 +10468,8 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX12-CU-LABEL: global_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11038,8 +10484,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX6-LABEL: global_singlethread_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11055,8 +10502,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11066,10 +10513,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11078,10 +10523,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11090,8 +10533,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11107,10 +10551,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11119,10 +10561,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11131,10 +10571,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11143,10 +10581,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11155,9 +10591,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11166,9 +10601,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11177,10 +10611,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11189,10 +10621,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11207,8 +10637,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX6-LABEL: global_singlethread_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11224,8 +10655,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX7-LABEL: global_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11235,10 +10666,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11247,10 +10676,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11259,8 +10686,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11276,10 +10704,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11288,10 +10714,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11300,10 +10724,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11312,10 +10734,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11324,9 +10744,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11335,9 +10754,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11346,10 +10764,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11358,10 +10774,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11376,8 +10790,9 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11393,8 +10808,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11404,10 +10819,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11416,10 +10829,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11428,8 +10839,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11445,10 +10857,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11457,10 +10867,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11469,10 +10877,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11481,10 +10887,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11493,9 +10897,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11504,9 +10907,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11515,10 +10917,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11527,10 +10927,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11545,8 +10943,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11562,9 +10960,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11575,9 +10972,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11586,9 +10982,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11596,8 +10991,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11614,9 +11009,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11625,9 +11019,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11636,9 +11029,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11647,9 +11039,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11658,8 +11049,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11668,8 +11059,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11678,8 +11069,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11688,8 +11079,8 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11703,8 +11094,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11720,9 +11111,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11733,9 +11123,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11744,9 +11133,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11754,8 +11142,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11772,9 +11160,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11783,9 +11170,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11794,9 +11180,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11805,9 +11190,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11816,8 +11200,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11826,8 +11210,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11836,8 +11220,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11846,8 +11230,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11861,8 +11245,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11878,9 +11262,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11891,9 +11274,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11902,9 +11284,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11912,8 +11293,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11930,9 +11311,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11941,9 +11321,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11952,9 +11331,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11963,9 +11341,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11974,8 +11351,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11984,8 +11361,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11994,8 +11371,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12004,8 +11381,8 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12019,8 +11396,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12036,9 +11413,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12049,9 +11425,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12060,9 +11435,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12070,8 +11444,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12088,9 +11462,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12099,9 +11472,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12110,9 +11482,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12121,9 +11492,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12132,8 +11502,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12142,8 +11512,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12152,8 +11522,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12162,8 +11532,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12177,8 +11547,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12194,9 +11564,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12207,9 +11576,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12218,9 +11586,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12228,8 +11595,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12246,9 +11613,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12257,9 +11623,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12268,9 +11633,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12279,9 +11643,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12290,8 +11653,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12300,8 +11663,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12310,8 +11673,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12320,8 +11683,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12335,8 +11698,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12354,8 +11717,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12370,9 +11733,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12383,9 +11745,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12395,8 +11756,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12415,9 +11776,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12428,9 +11788,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12441,9 +11800,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12454,9 +11812,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12467,8 +11824,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12479,8 +11836,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12491,8 +11848,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12503,8 +11860,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12521,8 +11878,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12540,8 +11897,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12556,9 +11913,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12569,9 +11925,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12581,8 +11936,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12601,9 +11956,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12614,9 +11968,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12627,9 +11980,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12640,9 +11992,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12653,8 +12004,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12665,8 +12016,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12677,8 +12028,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12689,8 +12040,8 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12707,8 +12058,8 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12726,8 +12077,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12742,9 +12093,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12755,9 +12105,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12767,8 +12116,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12787,9 +12136,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12800,9 +12148,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12813,9 +12160,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12826,9 +12172,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12839,8 +12184,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12851,8 +12196,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12863,8 +12208,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12875,8 +12220,8 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12893,6 +12238,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -12914,7 +12260,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -12939,7 +12285,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12954,7 +12299,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12969,6 +12313,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -12990,7 +12335,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13005,7 +12349,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13020,7 +12363,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13035,7 +12377,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13050,7 +12391,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13065,7 +12405,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13080,8 +12419,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13096,8 +12433,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13119,6 +12454,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13140,7 +12476,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13165,7 +12501,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13180,7 +12515,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13195,6 +12529,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13216,7 +12551,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13231,7 +12565,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13246,7 +12579,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13261,7 +12593,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13276,7 +12607,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13291,7 +12621,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13306,8 +12635,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13322,8 +12649,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13345,6 +12670,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13366,7 +12692,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13391,7 +12717,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13406,7 +12731,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13421,6 +12745,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13442,7 +12767,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13457,7 +12781,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13472,7 +12795,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13487,7 +12809,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13502,7 +12823,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13517,7 +12837,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13532,8 +12851,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13548,8 +12865,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13571,6 +12886,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13592,7 +12908,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13617,7 +12933,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13632,7 +12947,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13647,6 +12961,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13668,7 +12983,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13683,7 +12997,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13698,7 +13011,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13713,7 +13025,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13728,7 +13039,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13743,7 +13053,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13758,8 +13067,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13774,8 +13081,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13797,6 +13102,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13818,7 +13124,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13843,7 +13149,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13858,7 +13163,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13873,6 +13177,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13894,7 +13199,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13909,7 +13213,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13924,7 +13227,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13939,7 +13241,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13954,7 +13255,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13969,7 +13269,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13984,8 +13283,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14000,8 +13297,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14023,6 +13318,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14044,7 +13340,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14069,7 +13365,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14084,7 +13379,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14099,6 +13393,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14120,7 +13415,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14135,7 +13429,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14150,7 +13443,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14165,7 +13457,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14180,7 +13471,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14195,7 +13485,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14210,8 +13499,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14226,8 +13513,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14249,6 +13534,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14270,7 +13556,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14295,7 +13581,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14310,7 +13595,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14325,6 +13609,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14346,7 +13631,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14361,7 +13645,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14376,7 +13659,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14391,7 +13673,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14406,7 +13687,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14421,7 +13701,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14436,8 +13715,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14452,8 +13729,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14475,6 +13750,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14496,7 +13772,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14521,7 +13797,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14536,7 +13811,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14551,6 +13825,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14572,7 +13847,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14587,7 +13861,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14602,7 +13875,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14617,7 +13889,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14632,7 +13903,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14647,7 +13917,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14662,8 +13931,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14678,8 +13945,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14701,6 +13966,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14722,7 +13988,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14747,7 +14013,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14762,7 +14027,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14777,6 +14041,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14798,7 +14063,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14813,7 +14077,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14828,7 +14091,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14843,7 +14105,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14858,7 +14119,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14873,7 +14133,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14888,8 +14147,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14904,8 +14161,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14927,6 +14182,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14948,7 +14204,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14973,7 +14229,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14988,7 +14243,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15003,6 +14257,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15024,7 +14279,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15039,7 +14293,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15054,7 +14307,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15069,7 +14321,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15084,7 +14335,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15099,7 +14349,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15114,8 +14363,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15130,8 +14377,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15153,6 +14398,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15174,7 +14420,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15199,7 +14445,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15214,7 +14459,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15229,6 +14473,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15250,7 +14495,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15265,7 +14509,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15280,7 +14523,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15295,7 +14537,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15310,7 +14551,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15325,7 +14565,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15340,8 +14579,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15356,8 +14593,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15379,6 +14614,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15400,7 +14636,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15425,7 +14661,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15440,7 +14675,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15455,6 +14689,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15476,7 +14711,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15491,7 +14725,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15506,7 +14739,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15521,7 +14753,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15536,7 +14767,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15551,7 +14781,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15566,8 +14795,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15582,8 +14809,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15605,6 +14830,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15626,7 +14852,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15651,7 +14877,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15666,7 +14891,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15681,6 +14905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15702,7 +14927,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15717,7 +14941,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15732,7 +14955,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15747,7 +14969,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15762,7 +14983,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15777,7 +14997,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15792,8 +15011,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15808,8 +15025,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15831,6 +15046,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15852,7 +15068,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15877,7 +15093,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15892,7 +15107,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15907,6 +15121,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15928,7 +15143,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15943,7 +15157,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15958,7 +15171,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15973,7 +15185,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15988,7 +15199,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16003,7 +15213,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16018,8 +15227,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16034,8 +15241,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16057,6 +15262,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16078,7 +15284,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16103,7 +15309,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16118,7 +15323,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16133,6 +15337,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16154,7 +15359,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16169,7 +15373,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16184,7 +15387,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16199,7 +15401,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16214,7 +15415,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16229,7 +15429,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16244,8 +15443,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16260,8 +15457,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16283,6 +15478,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16307,6 +15503,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16335,7 +15532,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16352,7 +15548,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16369,6 +15564,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16393,7 +15589,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16410,7 +15605,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16427,7 +15621,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16444,7 +15637,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16461,7 +15653,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16478,7 +15669,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16495,8 +15685,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16513,8 +15701,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16540,6 +15726,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16564,6 +15751,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16592,7 +15780,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16609,7 +15796,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16626,6 +15812,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16650,7 +15837,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16667,7 +15853,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16684,7 +15869,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16701,7 +15885,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16718,7 +15901,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16735,7 +15917,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16752,8 +15933,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16770,8 +15949,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16797,6 +15974,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16821,6 +15999,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16849,7 +16028,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16866,7 +16044,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16883,6 +16060,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16907,7 +16085,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16924,7 +16101,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16941,7 +16117,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16958,7 +16133,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16975,7 +16149,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16992,7 +16165,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17009,8 +16181,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17027,8 +16197,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17054,6 +16222,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17078,6 +16247,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17106,7 +16276,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17123,7 +16292,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17140,6 +16308,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17164,7 +16333,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17181,7 +16349,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17198,7 +16365,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17215,7 +16381,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17232,7 +16397,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17249,7 +16413,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17266,8 +16429,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17284,8 +16445,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17311,6 +16470,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17335,6 +16495,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17363,7 +16524,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17380,7 +16540,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17397,6 +16556,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17421,7 +16581,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17438,7 +16597,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17455,7 +16613,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17472,7 +16629,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17489,7 +16645,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17506,7 +16661,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17523,8 +16677,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17541,8 +16693,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17568,6 +16718,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17592,6 +16743,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17620,7 +16772,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17637,7 +16788,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17654,6 +16804,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17678,7 +16829,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17695,7 +16845,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17712,7 +16861,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17729,7 +16877,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17746,7 +16893,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17763,7 +16909,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17780,8 +16925,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17798,8 +16941,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17825,6 +16966,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17849,6 +16991,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17877,7 +17020,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17894,7 +17036,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17911,6 +17052,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17935,7 +17077,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17952,7 +17093,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17969,7 +17109,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17986,7 +17125,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18003,7 +17141,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18020,7 +17157,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18037,8 +17173,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18055,8 +17189,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18082,6 +17214,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18106,6 +17239,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18134,7 +17268,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18151,7 +17284,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18168,6 +17300,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18192,7 +17325,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18209,7 +17341,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18226,7 +17357,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18243,7 +17373,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18260,7 +17389,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18277,7 +17405,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18294,8 +17421,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18312,8 +17437,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18339,6 +17462,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18363,6 +17487,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18391,7 +17516,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18408,7 +17532,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18425,6 +17548,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18449,7 +17573,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18466,7 +17589,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18483,7 +17605,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18500,7 +17621,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18517,7 +17637,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18534,7 +17653,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18551,8 +17669,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18569,8 +17685,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18596,6 +17710,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18620,6 +17735,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18648,7 +17764,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18665,7 +17780,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18682,6 +17796,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18706,7 +17821,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18723,7 +17837,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18740,7 +17853,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18757,7 +17869,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18774,7 +17885,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18791,7 +17901,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18808,8 +17917,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18826,8 +17933,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18853,6 +17958,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18877,6 +17983,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18905,7 +18012,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18922,7 +18028,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18939,6 +18044,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18963,7 +18069,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18980,7 +18085,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18997,7 +18101,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19014,7 +18117,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19031,7 +18133,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19048,7 +18149,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19065,8 +18165,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19083,8 +18181,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19110,6 +18206,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19134,6 +18231,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19162,7 +18260,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19179,7 +18276,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19196,6 +18292,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19220,7 +18317,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19237,7 +18333,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19254,7 +18349,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19271,7 +18365,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19288,7 +18381,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19305,7 +18397,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19322,8 +18413,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19340,8 +18429,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19367,6 +18454,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19391,6 +18479,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19419,7 +18508,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19436,7 +18524,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19453,6 +18540,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19477,7 +18565,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19494,7 +18581,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19511,7 +18597,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19528,7 +18613,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19545,7 +18629,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19562,7 +18645,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19579,8 +18661,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19597,8 +18677,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19624,6 +18702,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19648,6 +18727,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19676,7 +18756,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19693,7 +18772,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19710,6 +18788,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19734,7 +18813,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19751,7 +18829,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19768,7 +18845,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19785,7 +18861,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19802,7 +18877,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19819,7 +18893,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19836,8 +18909,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19854,8 +18925,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19881,6 +18950,7 @@ entry: define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19905,6 +18975,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19933,7 +19004,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19950,7 +19020,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19967,6 +19036,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19991,7 +19061,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20008,7 +19077,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20025,7 +19093,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20042,7 +19109,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20059,7 +19125,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20076,7 +19141,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20093,8 +19157,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20111,8 +19173,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index ae5ec082024fd6..466276eea73bed 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -16,8 +16,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -40,9 +41,8 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX7-LABEL: global_system_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -55,11 +55,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX10-WGP-LABEL: global_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -68,11 +66,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX10-CU-LABEL: global_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -81,8 +77,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_system_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -105,11 +102,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -118,11 +113,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -131,11 +124,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -144,11 +135,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_system_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -157,10 +146,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX11-WGP-LABEL: global_system_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -169,10 +157,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX11-CU-LABEL: global_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -181,11 +168,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX12-WGP-LABEL: global_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -194,11 +179,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX12-CU-LABEL: global_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -214,8 +197,9 @@ entry: define amdgpu_kernel void @global_system_monotonic_load( ; GFX6-LABEL: global_system_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -238,9 +222,8 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX7-LABEL: global_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -253,11 +236,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX10-WGP-LABEL: global_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -266,11 +247,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX10-CU-LABEL: global_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -279,8 +258,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -303,11 +283,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -316,11 +294,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -329,11 +305,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -342,11 +316,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -355,10 +327,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX11-WGP-LABEL: global_system_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -367,10 +338,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX11-CU-LABEL: global_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -379,11 +349,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX12-WGP-LABEL: global_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -392,11 +360,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX12-CU-LABEL: global_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -412,8 +378,9 @@ entry: define amdgpu_kernel void @global_system_acquire_load( ; GFX6-LABEL: global_system_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -437,9 +404,8 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX7-LABEL: global_system_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -453,11 +419,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX10-WGP-LABEL: global_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -468,11 +432,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX10-CU-LABEL: global_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -483,8 +445,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -507,11 +470,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -522,11 +483,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -537,11 +496,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -551,11 +508,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -565,10 +520,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX11-WGP-LABEL: global_system_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -579,10 +533,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX11-CU-LABEL: global_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -593,11 +546,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX12-WGP-LABEL: global_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -607,11 +558,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX12-CU-LABEL: global_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -628,8 +577,9 @@ entry: define amdgpu_kernel void @global_system_seq_cst_load( ; GFX6-LABEL: global_system_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -654,9 +604,8 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX7-LABEL: global_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -671,11 +620,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX10-WGP-LABEL: global_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -687,11 +634,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX10-CU-LABEL: global_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -703,8 +648,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -728,11 +674,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -743,11 +687,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -758,11 +700,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -772,11 +712,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -786,10 +724,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX11-WGP-LABEL: global_system_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -801,10 +738,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX11-CU-LABEL: global_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -816,11 +752,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX12-WGP-LABEL: global_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -836,11 +770,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX12-CU-LABEL: global_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 @@ -863,8 +795,9 @@ entry: define amdgpu_kernel void @global_system_unordered_store( ; GFX6-LABEL: global_system_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -880,8 +813,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX7-LABEL: global_system_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -891,10 +824,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX10-WGP-LABEL: global_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -903,10 +834,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX10-CU-LABEL: global_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -915,8 +844,9 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_system_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -932,10 +862,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -944,10 +872,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -956,10 +882,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -968,10 +892,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_system_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -980,9 +902,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX11-WGP-LABEL: global_system_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -991,9 +912,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX11-CU-LABEL: global_system_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1002,10 +922,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX12-WGP-LABEL: global_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1014,10 +932,8 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX12-CU-LABEL: global_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1032,8 +948,9 @@ entry: define amdgpu_kernel void @global_system_monotonic_store( ; GFX6-LABEL: global_system_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1049,8 +966,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX7-LABEL: global_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1060,10 +977,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX10-WGP-LABEL: global_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1072,10 +987,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX10-CU-LABEL: global_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1084,8 +997,9 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1101,10 +1015,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1113,10 +1025,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1125,10 +1035,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1137,10 +1045,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1149,9 +1055,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX11-WGP-LABEL: global_system_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1160,9 +1065,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX11-CU-LABEL: global_system_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1171,10 +1075,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX12-WGP-LABEL: global_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1183,10 +1085,8 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX12-CU-LABEL: global_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1201,8 +1101,9 @@ entry: define amdgpu_kernel void @global_system_release_store( ; GFX6-LABEL: global_system_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1219,8 +1120,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX7-LABEL: global_system_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1231,10 +1132,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX10-WGP-LABEL: global_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1245,10 +1144,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX10-CU-LABEL: global_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1259,8 +1156,9 @@ define amdgpu_kernel void @global_system_release_store( ; ; SKIP-CACHE-INV-LABEL: global_system_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1277,10 +1175,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1291,10 +1187,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1305,10 +1199,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1319,10 +1211,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX940-TGSPLIT-LABEL: global_system_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1333,9 +1223,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX11-WGP-LABEL: global_system_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1346,9 +1235,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX11-CU-LABEL: global_system_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1359,10 +1247,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX12-WGP-LABEL: global_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1376,10 +1262,8 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX12-CU-LABEL: global_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1399,8 +1283,9 @@ entry: define amdgpu_kernel void @global_system_seq_cst_store( ; GFX6-LABEL: global_system_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1417,8 +1302,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX7-LABEL: global_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1429,10 +1314,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX10-WGP-LABEL: global_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1443,10 +1326,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX10-CU-LABEL: global_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1457,8 +1338,9 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1475,10 +1357,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1489,10 +1369,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1503,10 +1381,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1517,10 +1393,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1531,9 +1405,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX11-WGP-LABEL: global_system_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1544,9 +1417,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX11-CU-LABEL: global_system_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1557,10 +1429,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX12-WGP-LABEL: global_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1574,10 +1444,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX12-CU-LABEL: global_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1597,8 +1465,8 @@ entry: define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX6-LABEL: global_system_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1614,9 +1482,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1627,9 +1494,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1638,9 +1504,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1648,8 +1513,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1666,9 +1531,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1677,9 +1541,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1688,9 +1551,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -1699,9 +1561,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -1710,8 +1571,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1720,8 +1581,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1730,8 +1591,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -1740,8 +1601,8 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -1755,8 +1616,8 @@ entry: define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX6-LABEL: global_system_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1774,9 +1635,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1789,9 +1649,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1803,9 +1662,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX10-CU-LABEL: global_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1816,8 +1674,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1835,9 +1693,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1849,9 +1706,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1863,9 +1719,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -1876,9 +1731,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -1889,8 +1743,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1902,8 +1756,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX11-CU-LABEL: global_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1915,8 +1769,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -1927,8 +1781,8 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-LABEL: global_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -1944,8 +1798,8 @@ entry: define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX6-LABEL: global_system_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1962,9 +1816,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX7-LABEL: global_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1976,9 +1829,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX10-WGP-LABEL: global_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1989,9 +1841,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX10-CU-LABEL: global_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2001,8 +1852,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2020,9 +1871,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -2033,9 +1883,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -2046,9 +1895,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2059,9 +1907,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2072,8 +1919,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX11-WGP-LABEL: global_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2084,8 +1931,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX11-CU-LABEL: global_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2096,8 +1943,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-WGP-LABEL: global_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -2111,8 +1958,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-LABEL: global_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -2131,8 +1978,8 @@ entry: define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2151,9 +1998,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2167,9 +2013,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2183,9 +2028,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2198,8 +2042,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2218,9 +2062,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -2234,9 +2077,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -2250,9 +2092,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2265,9 +2106,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2280,8 +2120,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2295,8 +2135,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2310,8 +2150,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -2327,8 +2167,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -2349,8 +2189,8 @@ entry: define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2369,9 +2209,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2385,9 +2224,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2401,9 +2239,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2416,8 +2253,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2436,9 +2273,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -2452,9 +2288,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -2468,9 +2303,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2483,9 +2317,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2498,8 +2331,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2513,8 +2346,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2528,8 +2361,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -2545,8 +2378,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -2567,8 +2400,8 @@ entry: define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2587,8 +2420,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2604,9 +2437,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2619,9 +2451,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2633,8 +2464,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2653,9 +2484,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2668,9 +2498,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2683,9 +2512,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1 @@ -2697,9 +2525,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1 @@ -2711,8 +2538,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2725,8 +2552,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2739,8 +2566,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -2752,8 +2579,8 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -2771,8 +2598,8 @@ entry: define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2792,8 +2619,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2810,9 +2637,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2827,9 +2653,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2843,8 +2668,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2864,9 +2689,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -2881,9 +2705,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -2898,9 +2721,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2914,9 +2736,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -2930,8 +2751,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2946,8 +2767,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2962,8 +2783,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -2982,8 +2803,8 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -3008,8 +2829,8 @@ entry: define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -3029,8 +2850,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3047,9 +2868,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3064,9 +2884,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3080,8 +2899,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -3101,9 +2920,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -3118,9 +2936,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -3135,9 +2952,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -3151,9 +2967,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -3167,8 +2982,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3183,8 +2998,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3199,8 +3014,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -3219,8 +3034,8 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -3245,6 +3060,7 @@ entry: define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3266,7 +3082,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3291,7 +3107,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3306,7 +3121,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3321,6 +3135,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3342,7 +3157,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3357,7 +3171,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3372,7 +3185,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3387,7 +3199,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3402,7 +3213,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3417,7 +3227,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3432,8 +3241,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3448,8 +3255,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3471,6 +3276,7 @@ entry: define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3494,7 +3300,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3521,7 +3327,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3539,7 +3344,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3557,6 +3361,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3579,7 +3384,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3597,7 +3401,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3615,7 +3418,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3632,7 +3434,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3649,7 +3450,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3667,7 +3467,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3685,8 +3484,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3703,8 +3500,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3728,6 +3523,7 @@ entry: define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX6-LABEL: global_system_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3750,7 +3546,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3776,7 +3572,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3793,7 +3588,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3810,6 +3604,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3832,7 +3627,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3849,7 +3643,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3866,7 +3659,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3883,7 +3675,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3900,7 +3691,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3917,7 +3707,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3934,8 +3723,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3955,8 +3742,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3983,6 +3768,7 @@ entry: define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4007,7 +3793,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4035,7 +3821,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4055,7 +3840,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4075,6 +3859,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4098,7 +3883,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4118,7 +3902,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4138,7 +3921,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4157,7 +3939,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4176,7 +3957,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4196,7 +3976,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4216,8 +3995,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4239,8 +4016,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4269,6 +4044,7 @@ entry: define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4293,7 +4069,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4321,7 +4097,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4341,7 +4116,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4361,6 +4135,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4384,7 +4159,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4404,7 +4178,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4424,7 +4197,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4443,7 +4215,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4462,7 +4233,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4482,7 +4252,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4502,8 +4271,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4525,8 +4292,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4555,6 +4320,7 @@ entry: define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4578,7 +4344,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4605,7 +4371,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4623,7 +4388,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4641,6 +4405,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4663,7 +4428,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4681,7 +4445,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4699,7 +4462,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4716,7 +4478,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4733,7 +4494,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4751,7 +4511,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4769,8 +4528,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4787,8 +4544,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4812,6 +4567,7 @@ entry: define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4835,7 +4591,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4862,7 +4618,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4880,7 +4635,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4898,6 +4652,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4920,7 +4675,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4938,7 +4692,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4956,7 +4709,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4973,7 +4725,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4990,7 +4741,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5008,7 +4758,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5026,8 +4775,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5044,8 +4791,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5069,6 +4814,7 @@ entry: define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX6-LABEL: global_system_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5093,7 +4839,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5121,7 +4867,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5141,7 +4886,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5161,6 +4905,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5184,7 +4929,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5204,7 +4948,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5224,7 +4967,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5243,7 +4985,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5262,7 +5003,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5282,7 +5022,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5302,8 +5041,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5325,8 +5062,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5355,6 +5090,7 @@ entry: define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5379,7 +5115,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5407,7 +5143,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5427,7 +5162,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5447,6 +5181,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5470,7 +5205,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5490,7 +5224,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5510,7 +5243,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5529,7 +5261,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5548,7 +5279,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5568,7 +5298,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5588,8 +5317,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5611,8 +5338,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5641,6 +5366,7 @@ entry: define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5665,7 +5391,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5693,7 +5419,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5713,7 +5438,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5733,6 +5457,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5756,7 +5481,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5776,7 +5500,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5796,7 +5519,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5815,7 +5537,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5834,7 +5555,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5854,7 +5574,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5874,8 +5593,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5897,8 +5614,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5927,6 +5642,7 @@ entry: define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5951,7 +5667,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5979,7 +5695,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5999,7 +5714,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6019,6 +5733,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6042,7 +5757,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6062,7 +5776,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6082,7 +5795,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6101,7 +5813,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6120,7 +5831,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6140,7 +5850,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6160,8 +5869,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6183,8 +5890,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6213,6 +5918,7 @@ entry: define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6237,6 +5943,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6265,7 +5972,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6282,7 +5988,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6299,6 +6004,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6323,7 +6029,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6340,7 +6045,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6357,7 +6061,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6374,7 +6077,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6391,7 +6093,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6408,7 +6109,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6425,8 +6125,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6443,8 +6141,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6470,6 +6166,7 @@ entry: define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6495,6 +6192,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6524,7 +6222,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6543,7 +6240,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6562,6 +6258,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6587,7 +6284,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6606,7 +6302,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6625,7 +6320,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6643,7 +6337,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6661,7 +6354,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6680,7 +6372,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6699,8 +6390,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6718,8 +6407,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6746,6 +6433,7 @@ entry: define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6772,6 +6460,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6802,7 +6491,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6823,7 +6511,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6844,6 +6531,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6870,7 +6558,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6891,7 +6578,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6912,7 +6598,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6932,7 +6617,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6952,7 +6636,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6973,7 +6656,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6994,8 +6676,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7020,8 +6700,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7055,6 +6733,7 @@ entry: define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7081,6 +6760,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7111,7 +6791,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7132,7 +6811,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7153,6 +6831,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7179,7 +6858,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7200,7 +6878,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7221,7 +6898,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7241,7 +6917,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7261,7 +6936,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7282,7 +6956,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7303,8 +6976,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7329,8 +7000,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7364,6 +7033,7 @@ entry: define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7389,6 +7059,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7418,7 +7089,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7437,7 +7107,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7456,6 +7125,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7481,7 +7151,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7500,7 +7169,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7519,7 +7187,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7537,7 +7204,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7555,7 +7221,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7574,7 +7239,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7593,8 +7257,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7614,8 +7276,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7644,6 +7304,7 @@ entry: define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7669,6 +7330,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7698,7 +7360,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7717,7 +7378,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7736,6 +7396,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7761,7 +7422,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7780,7 +7440,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7799,7 +7458,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7817,7 +7475,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7835,7 +7492,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7854,7 +7510,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7873,8 +7528,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7892,8 +7545,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7920,6 +7571,7 @@ entry: define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7946,6 +7598,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7976,7 +7629,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7997,7 +7649,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8018,6 +7669,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8044,7 +7696,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8065,7 +7716,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8086,7 +7736,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8106,7 +7755,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8126,7 +7774,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8147,7 +7794,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8168,8 +7814,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8194,8 +7838,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8229,6 +7871,7 @@ entry: define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8255,6 +7898,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8285,7 +7929,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8306,7 +7949,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8327,6 +7969,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8353,7 +7996,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8374,7 +8016,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8395,7 +8036,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8415,7 +8055,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8435,7 +8074,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8456,7 +8094,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8477,8 +8114,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8503,8 +8138,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8538,6 +8171,7 @@ entry: define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8564,6 +8198,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8594,7 +8229,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8615,7 +8249,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8636,6 +8269,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8662,7 +8296,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8683,7 +8316,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8704,7 +8336,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8724,7 +8355,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8744,7 +8374,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8765,7 +8394,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8786,8 +8414,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8812,8 +8438,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8847,6 +8471,7 @@ entry: define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8873,6 +8498,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8903,7 +8529,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8924,7 +8549,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8945,6 +8569,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8971,7 +8596,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8992,7 +8616,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9013,7 +8636,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9033,7 +8655,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9053,7 +8674,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9074,7 +8694,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9095,8 +8714,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9121,8 +8738,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9156,6 +8771,7 @@ entry: define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9182,6 +8798,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9212,7 +8829,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9233,7 +8849,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9254,6 +8869,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9280,7 +8896,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9301,7 +8916,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9322,7 +8936,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9342,7 +8955,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9362,7 +8974,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9383,7 +8994,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9404,8 +9014,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9428,8 +9036,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9461,6 +9067,7 @@ entry: define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9487,6 +9094,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9517,7 +9125,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9538,7 +9145,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9559,6 +9165,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9585,7 +9192,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9606,7 +9212,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9627,7 +9232,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9647,7 +9251,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9667,7 +9270,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9688,7 +9290,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9709,8 +9310,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9735,8 +9334,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9770,6 +9367,7 @@ entry: define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9796,6 +9394,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9826,7 +9425,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9847,7 +9445,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9868,6 +9465,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9894,7 +9492,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9915,7 +9512,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9936,7 +9532,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9956,7 +9551,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9976,7 +9570,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9997,7 +9590,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10018,8 +9610,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10044,8 +9634,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10079,6 +9667,7 @@ entry: define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10105,6 +9694,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10135,7 +9725,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10156,7 +9745,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10177,6 +9765,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10203,7 +9792,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10224,7 +9812,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10245,7 +9832,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10265,7 +9851,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10285,7 +9870,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10306,7 +9890,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10327,8 +9910,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10353,8 +9934,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10388,8 +9967,9 @@ entry: define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX6-LABEL: global_system_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10412,9 +9992,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX7-LABEL: global_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10427,11 +10006,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10440,11 +10017,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX10-CU-LABEL: global_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10453,8 +10028,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10477,11 +10053,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10490,11 +10064,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10503,11 +10075,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10516,11 +10086,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10529,10 +10097,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX11-WGP-LABEL: global_system_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10541,10 +10108,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX11-CU-LABEL: global_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10553,11 +10119,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX12-WGP-LABEL: global_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10566,11 +10130,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX12-CU-LABEL: global_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10586,8 +10148,9 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX6-LABEL: global_system_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10610,9 +10173,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX7-LABEL: global_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10625,11 +10187,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10638,11 +10198,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10651,8 +10209,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10675,11 +10234,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10688,11 +10245,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10701,11 +10256,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10714,11 +10267,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10727,10 +10278,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10739,10 +10289,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10751,11 +10300,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10764,11 +10311,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10784,8 +10329,9 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX6-LABEL: global_system_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10809,9 +10355,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX7-LABEL: global_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10825,11 +10370,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10840,11 +10383,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10855,8 +10396,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10879,11 +10421,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10894,11 +10434,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10909,11 +10447,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10923,11 +10459,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10937,10 +10471,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10951,10 +10484,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10965,11 +10497,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10979,11 +10509,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11000,8 +10528,9 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX6-LABEL: global_system_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -11026,9 +10555,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX7-LABEL: global_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11043,11 +10571,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -11059,11 +10585,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc @@ -11075,8 +10599,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11100,11 +10625,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11115,11 +10638,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11130,11 +10651,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11144,11 +10663,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11158,10 +10675,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -11173,10 +10689,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -11188,11 +10703,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11208,11 +10721,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11235,8 +10746,9 @@ entry: define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX6-LABEL: global_system_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11252,8 +10764,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX7-LABEL: global_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11263,10 +10775,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11275,10 +10785,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX10-CU-LABEL: global_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11287,8 +10795,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11304,10 +10813,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11316,10 +10823,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11328,10 +10833,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11340,10 +10843,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11352,9 +10853,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX11-WGP-LABEL: global_system_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11363,9 +10863,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX11-CU-LABEL: global_system_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11374,10 +10873,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX12-WGP-LABEL: global_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11386,10 +10883,8 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX12-CU-LABEL: global_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11404,8 +10899,9 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX6-LABEL: global_system_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11421,8 +10917,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX7-LABEL: global_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11432,10 +10928,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11444,10 +10938,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11456,8 +10948,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11473,10 +10966,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11485,10 +10976,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11497,10 +10986,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11509,10 +10996,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11521,9 +11006,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11532,9 +11016,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11543,10 +11026,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11555,10 +11036,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11573,8 +11052,9 @@ entry: define amdgpu_kernel void @global_system_one_as_release_store( ; GFX6-LABEL: global_system_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11591,8 +11071,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX7-LABEL: global_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11603,10 +11083,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX10-WGP-LABEL: global_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11617,10 +11095,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX10-CU-LABEL: global_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11631,8 +11107,9 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11649,10 +11126,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11663,10 +11138,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11677,10 +11150,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11691,10 +11162,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11705,9 +11174,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX11-WGP-LABEL: global_system_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11718,9 +11186,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX11-CU-LABEL: global_system_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11731,10 +11198,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX12-WGP-LABEL: global_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11748,10 +11213,8 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX12-CU-LABEL: global_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11771,8 +11234,9 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX6-LABEL: global_system_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11789,8 +11253,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX7-LABEL: global_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11801,10 +11265,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11815,10 +11277,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11829,8 +11289,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11847,10 +11308,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11861,10 +11320,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11875,10 +11332,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11889,10 +11344,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11903,9 +11356,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11916,9 +11368,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11929,10 +11380,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11946,10 +11395,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11969,8 +11416,8 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11986,9 +11433,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11999,9 +11445,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12010,9 +11455,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12020,8 +11464,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12038,9 +11482,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12049,9 +11492,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12060,9 +11502,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -12071,9 +11512,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -12082,8 +11522,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12092,8 +11532,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12102,8 +11542,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -12112,8 +11552,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -12127,8 +11567,8 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12146,9 +11586,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12161,9 +11600,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12175,9 +11613,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12188,8 +11625,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12207,9 +11644,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12221,9 +11657,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12235,9 +11670,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -12248,9 +11682,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1 @@ -12261,8 +11694,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12274,8 +11707,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12287,8 +11720,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -12299,8 +11732,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS @@ -12316,8 +11749,8 @@ entry: define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX6-LABEL: global_system_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12334,9 +11767,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12348,9 +11780,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12361,9 +11792,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -12373,8 +11803,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12392,9 +11822,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -12405,9 +11834,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -12418,9 +11846,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -12431,9 +11858,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -12444,8 +11870,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12456,8 +11882,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -12468,8 +11894,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -12483,8 +11909,8 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -12503,8 +11929,8 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12523,9 +11949,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12539,9 +11964,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12555,9 +11979,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -12570,8 +11993,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12590,9 +12013,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -12606,9 +12028,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -12622,9 +12043,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -12637,9 +12057,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -12652,8 +12071,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12667,8 +12086,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -12682,8 +12101,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -12699,8 +12118,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -12721,8 +12140,8 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12741,9 +12160,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12757,9 +12175,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12773,9 +12190,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -12788,8 +12204,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12808,9 +12224,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -12824,9 +12239,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -12840,9 +12254,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -12855,9 +12268,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -12870,8 +12282,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12885,8 +12297,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -12900,8 +12312,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -12917,8 +12329,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -12939,8 +12351,8 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12959,8 +12371,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12976,9 +12388,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12991,9 +12402,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13005,8 +12415,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13025,9 +12435,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13040,9 +12449,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13055,9 +12463,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1 @@ -13069,9 +12476,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1 @@ -13083,8 +12489,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -13097,8 +12503,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -13111,8 +12517,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13124,8 +12530,8 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13143,8 +12549,8 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13164,8 +12570,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -13182,9 +12588,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13199,9 +12604,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -13215,8 +12619,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13236,9 +12640,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -13253,9 +12656,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -13270,9 +12672,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -13286,9 +12687,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -13302,8 +12702,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13318,8 +12718,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -13334,8 +12734,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -13354,8 +12754,8 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -13380,8 +12780,8 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13401,8 +12801,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -13419,9 +12819,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13436,9 +12835,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -13452,8 +12850,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13473,9 +12871,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 @@ -13490,9 +12887,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 @@ -13507,9 +12903,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -13523,9 +12918,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 @@ -13539,8 +12933,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13555,8 +12949,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -13571,8 +12965,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS @@ -13591,8 +12985,8 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS @@ -13617,6 +13011,7 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13638,7 +13033,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13663,7 +13058,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13678,7 +13072,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13693,6 +13086,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13714,7 +13108,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13729,7 +13122,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13744,7 +13136,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13759,7 +13150,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13774,7 +13164,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13789,7 +13178,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13804,8 +13192,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13820,8 +13206,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13843,6 +13227,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13866,7 +13251,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13893,7 +13278,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13911,7 +13295,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13929,6 +13312,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13951,7 +13335,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13969,7 +13352,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13987,7 +13369,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14004,7 +13385,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14021,7 +13401,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14039,7 +13418,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14057,8 +13435,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14075,8 +13451,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14100,6 +13474,7 @@ entry: define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14122,7 +13497,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14148,7 +13523,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14165,7 +13539,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14182,6 +13555,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14204,7 +13578,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14221,7 +13594,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14238,7 +13610,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14255,7 +13626,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14272,7 +13642,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14289,7 +13658,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14306,8 +13674,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14327,8 +13693,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14355,6 +13719,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14379,7 +13744,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14407,7 +13772,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14427,7 +13791,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14447,6 +13810,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14470,7 +13834,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14490,7 +13853,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14510,7 +13872,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14529,7 +13890,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14548,7 +13908,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14568,7 +13927,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14588,8 +13946,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14611,8 +13967,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14641,6 +13995,7 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14665,7 +14020,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14693,7 +14048,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14713,7 +14067,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14733,6 +14086,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14756,7 +14110,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14776,7 +14129,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14796,7 +14148,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14815,7 +14166,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14834,7 +14184,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14854,7 +14203,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14874,8 +14222,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14897,8 +14243,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14927,6 +14271,7 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14950,7 +14295,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14977,7 +14322,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14995,7 +14339,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15013,6 +14356,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15035,7 +14379,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15053,7 +14396,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15071,7 +14413,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15088,7 +14429,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15105,7 +14445,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15123,7 +14462,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15141,8 +14479,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15159,8 +14495,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15184,6 +14518,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15207,7 +14542,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15234,7 +14569,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15252,7 +14586,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15270,6 +14603,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15292,7 +14626,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15310,7 +14643,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15328,7 +14660,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15345,7 +14676,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15362,7 +14692,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15380,7 +14709,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15398,8 +14726,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15416,8 +14742,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15441,6 +14765,7 @@ entry: define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15465,7 +14790,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15493,7 +14818,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15513,7 +14837,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15533,6 +14856,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15556,7 +14880,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15576,7 +14899,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15596,7 +14918,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15615,7 +14936,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15634,7 +14954,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15654,7 +14973,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15674,8 +14992,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15697,8 +15013,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15727,6 +15041,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15751,7 +15066,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15779,7 +15094,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15799,7 +15113,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15819,6 +15132,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15842,7 +15156,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15862,7 +15175,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15882,7 +15194,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15901,7 +15212,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15920,7 +15230,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15940,7 +15249,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15960,8 +15268,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15983,8 +15289,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16013,6 +15317,7 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16037,7 +15342,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16065,7 +15370,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16085,7 +15389,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16105,6 +15408,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16128,7 +15432,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16148,7 +15451,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16168,7 +15470,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16187,7 +15488,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16206,7 +15506,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16226,7 +15525,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16246,8 +15544,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16269,8 +15565,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16299,6 +15593,7 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16323,7 +15618,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16351,7 +15646,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16371,7 +15665,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16391,6 +15684,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16414,7 +15708,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16434,7 +15727,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16454,7 +15746,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16473,7 +15764,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16492,7 +15782,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16512,7 +15801,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16532,8 +15820,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16555,8 +15841,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16585,6 +15869,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16609,7 +15894,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16637,7 +15922,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16657,7 +15941,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16677,6 +15960,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16700,7 +15984,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16720,7 +16003,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16740,7 +16022,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16759,7 +16040,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16778,7 +16058,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16798,7 +16077,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16818,8 +16096,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16841,8 +16117,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16871,6 +16145,7 @@ entry: define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16895,7 +16170,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16923,7 +16198,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16943,7 +16217,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16963,6 +16236,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16986,7 +16260,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17006,7 +16279,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17026,7 +16298,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17045,7 +16316,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17064,7 +16334,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17084,7 +16353,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17104,8 +16372,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17127,8 +16393,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17157,6 +16421,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17181,7 +16446,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17209,7 +16474,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17229,7 +16493,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17249,6 +16512,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17272,7 +16536,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17292,7 +16555,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17312,7 +16574,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17331,7 +16592,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17350,7 +16610,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17370,7 +16629,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17390,8 +16648,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17413,8 +16669,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17443,6 +16697,7 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17467,7 +16722,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17495,7 +16750,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17515,7 +16769,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17535,6 +16788,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17558,7 +16812,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17578,7 +16831,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17598,7 +16850,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17617,7 +16868,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17636,7 +16886,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17656,7 +16905,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17676,8 +16924,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17699,8 +16945,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17729,6 +16973,7 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17753,6 +16998,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17781,7 +17027,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17798,7 +17043,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17815,6 +17059,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17839,7 +17084,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17856,7 +17100,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17873,7 +17116,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17890,7 +17132,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17907,7 +17148,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17924,7 +17164,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17941,8 +17180,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17959,8 +17196,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17986,6 +17221,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18011,6 +17247,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18040,7 +17277,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18059,7 +17295,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18078,6 +17313,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18103,7 +17339,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18122,7 +17357,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18141,7 +17375,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18159,7 +17392,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18177,7 +17409,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18196,7 +17427,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18215,8 +17445,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18234,8 +17462,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18262,6 +17488,7 @@ entry: define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18287,6 +17514,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18316,7 +17544,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18335,7 +17562,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18354,6 +17580,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18379,7 +17606,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18398,7 +17624,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18417,7 +17642,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18436,7 +17660,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18455,7 +17678,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18474,7 +17696,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18493,8 +17714,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18516,8 +17735,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18548,6 +17765,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18574,6 +17792,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18604,7 +17823,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18625,7 +17843,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18646,6 +17863,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18672,7 +17890,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18693,7 +17910,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18714,7 +17930,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18734,7 +17949,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18754,7 +17968,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18775,7 +17988,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18796,8 +18008,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18822,8 +18032,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18857,6 +18065,7 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18883,6 +18092,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18913,7 +18123,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18934,7 +18143,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18955,6 +18163,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18981,7 +18190,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19002,7 +18210,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19023,7 +18230,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19043,7 +18249,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19063,7 +18268,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19084,7 +18288,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19105,8 +18308,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19131,8 +18332,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19166,6 +18365,7 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19191,6 +18391,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19220,7 +18421,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19239,7 +18439,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19258,6 +18457,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19283,7 +18483,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19302,7 +18501,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19321,7 +18519,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19339,7 +18536,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19357,7 +18553,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19376,7 +18571,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19395,8 +18589,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19416,8 +18608,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19446,6 +18636,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19471,6 +18662,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19500,7 +18692,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19519,7 +18710,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19538,6 +18728,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19563,7 +18754,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19582,7 +18772,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19601,7 +18790,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19619,7 +18807,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19637,7 +18824,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19656,7 +18842,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19675,8 +18860,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19694,8 +18877,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19722,6 +18903,7 @@ entry: define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19748,6 +18930,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19778,7 +18961,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19799,7 +18981,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19820,6 +19001,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19846,7 +19028,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19867,7 +19048,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19888,7 +19068,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19908,7 +19087,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19928,7 +19106,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19949,7 +19126,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19970,8 +19146,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19996,8 +19170,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20031,6 +19203,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20057,6 +19230,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20087,7 +19261,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20108,7 +19281,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20129,6 +19301,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20155,7 +19328,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20176,7 +19348,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20197,7 +19368,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20217,7 +19387,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20237,7 +19406,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20258,7 +19426,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20279,8 +19446,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20305,8 +19470,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20340,6 +19503,7 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20366,6 +19530,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20396,7 +19561,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20417,7 +19581,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20438,6 +19601,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20464,7 +19628,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20485,7 +19648,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20506,7 +19668,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20526,7 +19687,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20546,7 +19706,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20567,7 +19726,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20588,8 +19746,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20614,8 +19770,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20649,6 +19803,7 @@ entry: define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20675,6 +19830,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20705,7 +19861,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20726,7 +19881,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20747,6 +19901,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20773,7 +19928,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20794,7 +19948,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20815,7 +19968,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20835,7 +19987,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20855,7 +20006,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20876,7 +20026,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20897,8 +20046,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20923,8 +20070,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20958,6 +20103,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20984,6 +20130,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21014,7 +20161,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21035,7 +20181,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21056,6 +20201,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21082,7 +20228,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21103,7 +20248,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21124,7 +20268,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21144,7 +20287,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21164,7 +20306,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21185,7 +20326,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21206,8 +20346,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21230,8 +20368,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21263,6 +20399,7 @@ entry: define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21289,6 +20426,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21319,7 +20457,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21340,7 +20477,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21361,6 +20497,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21387,7 +20524,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21408,7 +20544,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21429,7 +20564,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21449,7 +20583,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21469,7 +20602,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21490,7 +20622,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21511,8 +20642,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21537,8 +20666,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21572,6 +20699,7 @@ entry: define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21598,6 +20726,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21628,7 +20757,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21649,7 +20777,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21670,6 +20797,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21696,7 +20824,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21717,7 +20844,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21738,7 +20864,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21758,7 +20883,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21778,7 +20902,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21799,7 +20922,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21820,8 +20942,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21846,8 +20966,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21881,6 +20999,7 @@ entry: define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21907,6 +21026,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21937,7 +21057,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21958,7 +21077,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21979,6 +21097,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -22005,7 +21124,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22026,7 +21144,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -22047,7 +21164,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22067,7 +21183,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -22087,7 +21202,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22108,7 +21222,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22129,8 +21242,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22155,8 +21266,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 29d57f9ceaa4c6..2bf2e03cb0bd79 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -12,8 +12,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s2, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -36,9 +37,8 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX7-LABEL: global_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -51,11 +51,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX10-WGP-LABEL: global_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -64,11 +62,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX10-CU-LABEL: global_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -77,8 +73,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; SKIP-CACHE-INV-LABEL: global_volatile_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -101,10 +98,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX11-WGP-LABEL: global_volatile_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -113,10 +109,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX11-CU-LABEL: global_volatile_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -125,11 +120,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX12-WGP-LABEL: global_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -140,11 +133,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX12-CU-LABEL: global_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 @@ -162,8 +153,9 @@ entry: define amdgpu_kernel void @global_volatile_load_1( ; GFX6-LABEL: global_volatile_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s8, s1 ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -192,8 +184,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX7-LABEL: global_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -220,11 +213,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX10-WGP-LABEL: global_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_mov_b32 s8, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s8, v1 @@ -236,11 +227,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX10-CU-LABEL: global_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_mov_b32 s8, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s8, v1 @@ -252,8 +241,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; SKIP-CACHE-INV-LABEL: global_volatile_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -282,10 +272,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX11-WGP-LABEL: global_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s4, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s4 @@ -299,10 +288,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX11-CU-LABEL: global_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s4, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s4 @@ -316,11 +304,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX12-WGP-LABEL: global_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -338,11 +324,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX12-CU-LABEL: global_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -369,8 +353,9 @@ entry: define amdgpu_kernel void @global_volatile_store_0( ; GFX6-LABEL: global_volatile_store_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s8, s1 ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -389,8 +374,8 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX7-LABEL: global_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -403,10 +388,8 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX10-WGP-LABEL: global_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -418,10 +401,8 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX10-CU-LABEL: global_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 @@ -433,8 +414,9 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; SKIP-CACHE-INV-LABEL: global_volatile_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -453,9 +435,8 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX11-WGP-LABEL: global_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -467,9 +448,8 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX11-CU-LABEL: global_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -481,10 +461,8 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX12-WGP-LABEL: global_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -501,10 +479,8 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX12-CU-LABEL: global_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -528,9 +504,8 @@ entry: define amdgpu_kernel void @global_volatile_store_1( ; GFX6-LABEL: global_volatile_store_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0xf000 @@ -554,8 +529,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX7-LABEL: global_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s5, 2 @@ -582,10 +557,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX10-WGP-LABEL: global_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_mov_b32 s7, 2 @@ -598,10 +571,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX10-CU-LABEL: global_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX10-CU-NEXT: s_mov_b32 s7, 2 @@ -614,9 +585,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; SKIP-CACHE-INV-LABEL: global_volatile_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0xf000 @@ -640,9 +610,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX11-WGP-LABEL: global_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff @@ -657,9 +626,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX11-CU-LABEL: global_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff @@ -674,10 +642,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX12-WGP-LABEL: global_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff @@ -699,10 +665,8 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX12-CU-LABEL: global_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff @@ -733,8 +697,9 @@ entry: define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX6-LABEL: global_volatile_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s2, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -757,9 +722,8 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX7-LABEL: global_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -772,11 +736,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -786,11 +748,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -799,8 +759,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -823,10 +784,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -836,10 +796,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -848,11 +807,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -862,11 +819,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX12-CU-LABEL: global_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -882,8 +837,9 @@ entry: define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX6-LABEL: global_volatile_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s7, s1 ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -900,8 +856,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX7-LABEL: global_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -912,10 +868,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -926,10 +880,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: global_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -939,8 +891,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -957,9 +910,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX11-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -970,9 +922,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX11-CU-LABEL: global_volatile_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -982,10 +933,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX12-WGP-LABEL: global_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -998,10 +947,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX12-CU-LABEL: global_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index aaa11c0455606f..08682786f7b256 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -16,8 +16,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -40,9 +41,8 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX7-LABEL: global_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -55,11 +55,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX10-WGP-LABEL: global_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -68,11 +66,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX10-CU-LABEL: global_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -81,8 +77,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -105,11 +102,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -118,11 +113,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -131,11 +124,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -144,11 +135,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -157,10 +146,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX11-WGP-LABEL: global_wavefront_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -169,10 +157,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX11-CU-LABEL: global_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -181,11 +168,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX12-WGP-LABEL: global_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -194,11 +179,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX12-CU-LABEL: global_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -214,8 +197,9 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX6-LABEL: global_wavefront_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -238,9 +222,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX7-LABEL: global_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -253,11 +236,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -266,11 +247,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -279,8 +258,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -303,11 +283,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -316,11 +294,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -329,11 +305,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -342,11 +316,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -355,10 +327,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -367,10 +338,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -379,11 +349,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -392,11 +360,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -412,8 +378,9 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX6-LABEL: global_wavefront_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -436,9 +403,8 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX7-LABEL: global_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -451,11 +417,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -464,11 +428,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX10-CU-LABEL: global_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -477,8 +439,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -501,11 +464,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -514,11 +475,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -527,11 +486,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -540,11 +497,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -553,10 +508,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -565,10 +519,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX11-CU-LABEL: global_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -577,11 +530,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -590,11 +541,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX12-CU-LABEL: global_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -610,8 +559,9 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX6-LABEL: global_wavefront_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -634,9 +584,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -649,11 +598,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -662,11 +609,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -675,8 +620,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -699,11 +645,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -712,11 +656,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -725,11 +667,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -738,11 +678,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -751,10 +689,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -763,10 +700,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -775,11 +711,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -788,11 +722,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -808,8 +740,9 @@ entry: define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX6-LABEL: global_wavefront_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -825,8 +758,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX7-LABEL: global_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -836,10 +769,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: global_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -848,10 +779,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX10-CU-LABEL: global_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -860,8 +789,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -877,10 +807,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -889,10 +817,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -901,10 +827,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -913,10 +837,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -925,9 +847,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX11-WGP-LABEL: global_wavefront_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -936,9 +857,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX11-CU-LABEL: global_wavefront_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -947,10 +867,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX12-WGP-LABEL: global_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -959,10 +877,8 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX12-CU-LABEL: global_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -977,8 +893,9 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX6-LABEL: global_wavefront_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -994,8 +911,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX7-LABEL: global_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1005,10 +922,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1017,10 +932,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1029,8 +942,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1046,10 +960,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1058,10 +970,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1070,10 +980,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1082,10 +990,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1094,9 +1000,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1105,9 +1010,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1116,10 +1020,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1128,10 +1030,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1146,8 +1046,9 @@ entry: define amdgpu_kernel void @global_wavefront_release_store( ; GFX6-LABEL: global_wavefront_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1163,8 +1064,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX7-LABEL: global_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1174,10 +1075,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX10-WGP-LABEL: global_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1186,10 +1085,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX10-CU-LABEL: global_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1198,8 +1095,9 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1215,10 +1113,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1227,10 +1123,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1239,10 +1133,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1251,10 +1143,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1263,9 +1153,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX11-WGP-LABEL: global_wavefront_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1274,9 +1163,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX11-CU-LABEL: global_wavefront_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1285,10 +1173,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX12-WGP-LABEL: global_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1297,10 +1183,8 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX12-CU-LABEL: global_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1315,8 +1199,9 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX6-LABEL: global_wavefront_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1332,8 +1217,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1343,10 +1228,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1355,10 +1238,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1367,8 +1248,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1384,10 +1266,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1396,10 +1276,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1408,10 +1286,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1420,10 +1296,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1432,9 +1306,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1443,9 +1316,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1454,10 +1326,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1466,10 +1336,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1484,8 +1352,8 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX6-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1501,9 +1369,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1514,9 +1381,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1525,9 +1391,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1535,8 +1400,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1553,9 +1418,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1564,9 +1428,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1575,9 +1438,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1586,9 +1448,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1597,8 +1458,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1607,8 +1468,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1617,8 +1478,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1627,8 +1488,8 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1642,8 +1503,8 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX6-LABEL: global_wavefront_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1659,9 +1520,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1672,9 +1532,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1683,9 +1542,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1693,8 +1551,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1711,9 +1569,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1722,9 +1579,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1733,9 +1589,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1744,9 +1599,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1755,8 +1609,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1765,8 +1619,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1775,8 +1629,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1785,8 +1639,8 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1800,8 +1654,8 @@ entry: define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX6-LABEL: global_wavefront_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1817,9 +1671,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1830,9 +1683,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1841,9 +1693,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1851,8 +1702,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1869,9 +1720,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1880,9 +1730,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1891,9 +1740,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1902,9 +1750,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1913,8 +1760,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1923,8 +1770,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1933,8 +1780,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1943,8 +1790,8 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1958,8 +1805,8 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX6-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1975,9 +1822,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1988,9 +1834,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1999,9 +1844,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2009,8 +1853,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2027,9 +1871,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2038,9 +1881,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2049,9 +1891,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2060,9 +1901,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2071,8 +1911,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2081,8 +1921,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2091,8 +1931,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2101,8 +1941,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2116,8 +1956,8 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX6-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2133,9 +1973,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2146,9 +1985,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2157,9 +1995,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2167,8 +2004,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2185,9 +2022,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2196,9 +2032,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -2207,9 +2042,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2218,9 +2052,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -2229,8 +2062,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2239,8 +2072,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2249,8 +2082,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2259,8 +2092,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2274,8 +2107,8 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2293,8 +2126,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2309,9 +2142,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2322,9 +2154,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2334,8 +2165,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2354,9 +2185,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2367,9 +2197,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2380,9 +2209,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2393,9 +2221,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2406,8 +2233,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2418,8 +2245,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2430,8 +2257,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2442,8 +2269,8 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2460,8 +2287,8 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2479,8 +2306,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2495,9 +2322,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2508,9 +2334,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2520,8 +2345,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2540,9 +2365,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2553,9 +2377,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2566,9 +2389,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2579,9 +2401,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2592,8 +2413,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2604,8 +2425,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2616,8 +2437,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2628,8 +2449,8 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2646,8 +2467,8 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2665,8 +2486,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2681,9 +2502,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2694,9 +2514,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2706,8 +2525,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2726,9 +2545,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2739,9 +2557,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2752,9 +2569,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2765,9 +2581,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2778,8 +2593,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2790,8 +2605,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2802,8 +2617,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2814,8 +2629,8 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2832,6 +2647,7 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -2853,7 +2669,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -2878,7 +2694,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2893,7 +2708,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2908,6 +2722,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -2929,7 +2744,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2944,7 +2758,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -2959,7 +2772,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -2974,7 +2786,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -2989,7 +2800,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3004,7 +2814,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3019,8 +2828,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3035,8 +2842,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3058,6 +2863,7 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3079,7 +2885,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3104,7 +2910,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3119,7 +2924,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3134,6 +2938,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3155,7 +2960,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3170,7 +2974,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3185,7 +2988,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3200,7 +3002,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3215,7 +3016,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3230,7 +3030,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3245,8 +3044,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3261,8 +3058,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3284,6 +3079,7 @@ entry: define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3305,7 +3101,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3330,7 +3126,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3345,7 +3140,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3360,6 +3154,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3381,7 +3176,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3396,7 +3190,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3411,7 +3204,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3426,7 +3218,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3441,7 +3232,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3456,7 +3246,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3471,8 +3260,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3487,8 +3274,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3510,6 +3295,7 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3531,7 +3317,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3556,7 +3342,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3571,7 +3356,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3586,6 +3370,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3607,7 +3392,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3622,7 +3406,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3637,7 +3420,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3652,7 +3434,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3667,7 +3448,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3682,7 +3462,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3697,8 +3476,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3713,8 +3490,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3736,6 +3511,7 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3757,7 +3533,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3782,7 +3558,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3797,7 +3572,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3812,6 +3586,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3833,7 +3608,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3848,7 +3622,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3863,7 +3636,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3878,7 +3650,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3893,7 +3664,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3908,7 +3678,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3923,8 +3692,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3939,8 +3706,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3962,6 +3727,7 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3983,7 +3749,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4008,7 +3774,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4023,7 +3788,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4038,6 +3802,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4059,7 +3824,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4074,7 +3838,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4089,7 +3852,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4104,7 +3866,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4119,7 +3880,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4134,7 +3894,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4149,8 +3908,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4165,8 +3922,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4188,6 +3943,7 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4209,7 +3965,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4234,7 +3990,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4249,7 +4004,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4264,6 +4018,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4285,7 +4040,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4300,7 +4054,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4315,7 +4068,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4330,7 +4082,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4345,7 +4096,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4360,7 +4110,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4375,8 +4124,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4391,8 +4138,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4414,6 +4159,7 @@ entry: define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4435,7 +4181,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4460,7 +4206,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4475,7 +4220,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4490,6 +4234,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4511,7 +4256,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4526,7 +4270,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4541,7 +4284,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4556,7 +4298,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4571,7 +4312,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4586,7 +4326,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4601,8 +4340,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4617,8 +4354,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4640,6 +4375,7 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4661,7 +4397,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4686,7 +4422,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4701,7 +4436,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4716,6 +4450,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4737,7 +4472,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4752,7 +4486,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4767,7 +4500,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4782,7 +4514,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4797,7 +4528,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4812,7 +4542,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4827,8 +4556,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4843,8 +4570,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4866,6 +4591,7 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4887,7 +4613,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4912,7 +4638,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4927,7 +4652,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4942,6 +4666,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4963,7 +4688,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4978,7 +4702,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4993,7 +4716,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5008,7 +4730,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5023,7 +4744,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5038,7 +4758,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5053,8 +4772,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5069,8 +4786,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5092,6 +4807,7 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5113,7 +4829,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5138,7 +4854,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5153,7 +4868,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5168,6 +4882,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5189,7 +4904,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5204,7 +4918,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5219,7 +4932,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5234,7 +4946,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5249,7 +4960,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5264,7 +4974,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5279,8 +4988,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5295,8 +5002,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5318,6 +5023,7 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5339,7 +5045,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5364,7 +5070,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5379,7 +5084,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5394,6 +5098,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5415,7 +5120,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5430,7 +5134,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5445,7 +5148,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5460,7 +5162,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5475,7 +5176,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5490,7 +5190,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5505,8 +5204,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5521,8 +5218,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5544,6 +5239,7 @@ entry: define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5565,7 +5261,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5590,7 +5286,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5605,7 +5300,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5620,6 +5314,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5641,7 +5336,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5656,7 +5350,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5671,7 +5364,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5686,7 +5378,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5701,7 +5392,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5716,7 +5406,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5731,8 +5420,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5747,8 +5434,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5770,6 +5455,7 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5791,7 +5477,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5816,7 +5502,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5831,7 +5516,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5846,6 +5530,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5867,7 +5552,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5882,7 +5566,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5897,7 +5580,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5912,7 +5594,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5927,7 +5608,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5942,7 +5622,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5957,8 +5636,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5973,8 +5650,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5996,6 +5671,7 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6017,7 +5693,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6042,7 +5718,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6057,7 +5732,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6072,6 +5746,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6093,7 +5768,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6108,7 +5782,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6123,7 +5796,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6138,7 +5810,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6153,7 +5824,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6168,7 +5838,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6183,8 +5852,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6199,8 +5866,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6222,6 +5887,7 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6246,6 +5912,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6274,7 +5941,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6291,7 +5957,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6308,6 +5973,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6332,7 +5998,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6349,7 +6014,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6366,7 +6030,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6383,7 +6046,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6400,7 +6062,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6417,7 +6078,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6434,8 +6094,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6452,8 +6110,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6479,6 +6135,7 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6503,6 +6160,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6531,7 +6189,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6548,7 +6205,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6565,6 +6221,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6589,7 +6246,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6606,7 +6262,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6623,7 +6278,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6640,7 +6294,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6657,7 +6310,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6674,7 +6326,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6691,8 +6342,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6709,8 +6358,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6736,6 +6383,7 @@ entry: define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6760,6 +6408,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6788,7 +6437,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6805,7 +6453,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6822,6 +6469,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6846,7 +6494,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6863,7 +6510,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6880,7 +6526,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6897,7 +6542,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6914,7 +6558,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6931,7 +6574,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6948,8 +6590,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6966,8 +6606,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6993,6 +6631,7 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7017,6 +6656,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7045,7 +6685,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7062,7 +6701,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7079,6 +6717,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7103,7 +6742,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7120,7 +6758,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7137,7 +6774,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7154,7 +6790,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7171,7 +6806,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7188,7 +6822,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7205,8 +6838,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7223,8 +6854,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7250,6 +6879,7 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7274,6 +6904,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7302,7 +6933,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7319,7 +6949,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7336,6 +6965,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7360,7 +6990,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7377,7 +7006,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7394,7 +7022,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7411,7 +7038,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7428,7 +7054,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7445,7 +7070,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7462,8 +7086,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7480,8 +7102,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7507,6 +7127,7 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7531,6 +7152,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7559,7 +7181,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7576,7 +7197,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7593,6 +7213,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7617,7 +7238,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7634,7 +7254,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7651,7 +7270,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7668,7 +7286,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7685,7 +7302,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7702,7 +7318,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7719,8 +7334,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7737,8 +7350,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7764,6 +7375,7 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7788,6 +7400,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7816,7 +7429,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7833,7 +7445,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7850,6 +7461,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7874,7 +7486,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7891,7 +7502,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7908,7 +7518,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7925,7 +7534,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7942,7 +7550,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7959,7 +7566,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7976,8 +7582,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7994,8 +7598,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8021,6 +7623,7 @@ entry: define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8045,6 +7648,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8073,7 +7677,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8090,7 +7693,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8107,6 +7709,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8131,7 +7734,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8148,7 +7750,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8165,7 +7766,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8182,7 +7782,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8199,7 +7798,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8216,7 +7814,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8233,8 +7830,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8251,8 +7846,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8278,6 +7871,7 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8302,6 +7896,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8330,7 +7925,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8347,7 +7941,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8364,6 +7957,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8388,7 +7982,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8405,7 +7998,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8422,7 +8014,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8439,7 +8030,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8456,7 +8046,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8473,7 +8062,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8490,8 +8078,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8508,8 +8094,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8535,6 +8119,7 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8559,6 +8144,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8587,7 +8173,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8604,7 +8189,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8621,6 +8205,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8645,7 +8230,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8662,7 +8246,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8679,7 +8262,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8696,7 +8278,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8713,7 +8294,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8730,7 +8310,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8747,8 +8326,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8765,8 +8342,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8792,6 +8367,7 @@ entry: define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8816,6 +8392,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8844,7 +8421,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8861,7 +8437,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8878,6 +8453,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8902,7 +8478,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8919,7 +8494,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8936,7 +8510,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8953,7 +8526,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8970,7 +8542,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8987,7 +8558,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9004,8 +8574,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9022,8 +8590,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9049,6 +8615,7 @@ entry: define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9073,6 +8640,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9101,7 +8669,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9118,7 +8685,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9135,6 +8701,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9159,7 +8726,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9176,7 +8742,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9193,7 +8758,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9210,7 +8774,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9227,7 +8790,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9244,7 +8806,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9261,8 +8822,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9279,8 +8838,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9306,6 +8863,7 @@ entry: define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9330,6 +8888,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9358,7 +8917,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9375,7 +8933,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9392,6 +8949,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9416,7 +8974,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9433,7 +8990,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9450,7 +9006,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9467,7 +9022,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9484,7 +9038,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9501,7 +9054,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9518,8 +9070,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9536,8 +9086,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9563,6 +9111,7 @@ entry: define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9587,6 +9136,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9615,7 +9165,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9632,7 +9181,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9649,6 +9197,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9673,7 +9222,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9690,7 +9238,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9707,7 +9254,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9724,7 +9270,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9741,7 +9286,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9758,7 +9302,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9775,8 +9318,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9793,8 +9334,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9820,6 +9359,7 @@ entry: define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9844,6 +9384,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9872,7 +9413,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9889,7 +9429,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9906,6 +9445,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9930,7 +9470,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9947,7 +9486,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9964,7 +9502,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9981,7 +9518,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9998,7 +9534,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10015,7 +9550,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10032,8 +9566,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10050,8 +9582,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10077,8 +9607,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX6-LABEL: global_wavefront_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10101,9 +9632,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10116,11 +9646,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10129,11 +9657,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX10-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10142,8 +9668,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10166,11 +9693,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10179,11 +9704,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10192,11 +9715,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10205,11 +9726,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10218,10 +9737,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10230,10 +9748,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10242,11 +9759,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10255,11 +9770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10275,8 +9788,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX6-LABEL: global_wavefront_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10299,9 +9813,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10314,11 +9827,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10327,11 +9838,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10340,8 +9849,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10364,11 +9874,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10377,11 +9885,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10390,11 +9896,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10403,11 +9907,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10416,10 +9918,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10428,10 +9929,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10440,11 +9940,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10453,11 +9951,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10473,8 +9969,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX6-LABEL: global_wavefront_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10497,9 +9994,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10512,11 +10008,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10525,11 +10019,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10538,8 +10030,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10562,11 +10055,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10575,11 +10066,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10588,11 +10077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10601,11 +10088,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10614,10 +10099,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10626,10 +10110,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10638,11 +10121,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10651,11 +10132,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10671,8 +10150,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10695,9 +10175,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10710,11 +10189,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10723,11 +10200,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10736,8 +10211,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10760,11 +10236,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10773,11 +10247,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10786,11 +10258,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10799,11 +10269,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10812,10 +10280,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10824,10 +10291,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -10836,11 +10302,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -10849,11 +10313,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -10869,8 +10331,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX6-LABEL: global_wavefront_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10886,8 +10349,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10897,10 +10360,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -10909,10 +10370,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX10-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -10921,8 +10380,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -10938,10 +10398,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -10950,10 +10408,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -10962,10 +10418,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -10974,10 +10428,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -10986,9 +10438,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -10997,9 +10448,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11008,10 +10458,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11020,10 +10468,8 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11038,8 +10484,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX6-LABEL: global_wavefront_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11055,8 +10502,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11066,10 +10513,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11078,10 +10523,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11090,8 +10533,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11107,10 +10551,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11119,10 +10561,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11131,10 +10571,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11143,10 +10581,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11155,9 +10591,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11166,9 +10601,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11177,10 +10611,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11189,10 +10621,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11207,8 +10637,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX6-LABEL: global_wavefront_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11224,8 +10655,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX7-LABEL: global_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11235,10 +10666,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11247,10 +10676,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11259,8 +10686,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11276,10 +10704,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11288,10 +10714,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11300,10 +10724,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11312,10 +10734,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11324,9 +10744,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11335,9 +10754,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11346,10 +10764,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11358,10 +10774,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11376,8 +10790,9 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11393,8 +10808,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11404,10 +10819,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11416,10 +10829,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11428,8 +10839,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11445,10 +10857,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11457,10 +10867,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11469,10 +10877,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11481,10 +10887,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11493,9 +10897,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11504,9 +10907,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11515,10 +10917,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11527,10 +10927,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11545,8 +10943,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11562,9 +10960,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11575,9 +10972,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11586,9 +10982,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11596,8 +10991,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11614,9 +11009,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11625,9 +11019,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11636,9 +11029,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11647,9 +11039,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11658,8 +11049,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11668,8 +11059,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11678,8 +11069,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11688,8 +11079,8 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11703,8 +11094,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11720,9 +11111,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11733,9 +11123,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11744,9 +11133,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11754,8 +11142,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11772,9 +11160,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11783,9 +11170,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11794,9 +11180,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11805,9 +11190,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11816,8 +11200,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11826,8 +11210,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11836,8 +11220,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11846,8 +11230,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11861,8 +11245,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11878,9 +11262,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11891,9 +11274,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11902,9 +11284,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11912,8 +11293,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11930,9 +11311,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11941,9 +11321,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -11952,9 +11331,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11963,9 +11341,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -11974,8 +11351,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11984,8 +11361,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -11994,8 +11371,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12004,8 +11381,8 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12019,8 +11396,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12036,9 +11413,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12049,9 +11425,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12060,9 +11435,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12070,8 +11444,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12088,9 +11462,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12099,9 +11472,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12110,9 +11482,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12121,9 +11492,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12132,8 +11502,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12142,8 +11512,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12152,8 +11522,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12162,8 +11532,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12177,8 +11547,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12194,9 +11564,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12207,9 +11576,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12218,9 +11586,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12228,8 +11595,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12246,9 +11613,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12257,9 +11623,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12268,9 +11633,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12279,9 +11643,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12290,8 +11653,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12300,8 +11663,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12310,8 +11673,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12320,8 +11683,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12335,8 +11698,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12354,8 +11717,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12370,9 +11733,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12383,9 +11745,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12395,8 +11756,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12415,9 +11776,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12428,9 +11788,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12441,9 +11800,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12454,9 +11812,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12467,8 +11824,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12479,8 +11836,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12491,8 +11848,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12503,8 +11860,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12521,8 +11878,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12540,8 +11897,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12556,9 +11913,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12569,9 +11925,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12581,8 +11936,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12601,9 +11956,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12614,9 +11968,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12627,9 +11980,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12640,9 +11992,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12653,8 +12004,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12665,8 +12016,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12677,8 +12028,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12689,8 +12040,8 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12707,8 +12058,8 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12726,8 +12077,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -12742,9 +12093,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12755,9 +12105,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12767,8 +12116,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12787,9 +12136,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12800,9 +12148,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -12813,9 +12160,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12826,9 +12172,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -12839,8 +12184,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12851,8 +12196,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -12863,8 +12208,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12875,8 +12220,8 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -12893,6 +12238,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -12914,7 +12260,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -12939,7 +12285,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12954,7 +12299,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -12969,6 +12313,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -12990,7 +12335,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13005,7 +12349,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13020,7 +12363,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13035,7 +12377,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13050,7 +12391,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13065,7 +12405,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13080,8 +12419,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13096,8 +12433,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13119,6 +12454,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13140,7 +12476,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13165,7 +12501,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13180,7 +12515,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13195,6 +12529,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13216,7 +12551,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13231,7 +12565,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13246,7 +12579,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13261,7 +12593,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13276,7 +12607,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13291,7 +12621,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13306,8 +12635,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13322,8 +12649,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13345,6 +12670,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13366,7 +12692,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13391,7 +12717,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13406,7 +12731,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13421,6 +12745,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13442,7 +12767,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13457,7 +12781,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13472,7 +12795,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13487,7 +12809,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13502,7 +12823,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13517,7 +12837,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13532,8 +12851,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13548,8 +12865,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13571,6 +12886,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13592,7 +12908,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13617,7 +12933,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13632,7 +12947,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13647,6 +12961,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13668,7 +12983,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13683,7 +12997,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13698,7 +13011,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13713,7 +13025,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13728,7 +13039,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13743,7 +13053,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13758,8 +13067,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13774,8 +13081,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13797,6 +13102,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13818,7 +13124,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13843,7 +13149,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13858,7 +13163,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13873,6 +13177,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13894,7 +13199,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13909,7 +13213,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13924,7 +13227,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13939,7 +13241,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13954,7 +13255,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13969,7 +13269,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13984,8 +13283,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14000,8 +13297,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14023,6 +13318,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14044,7 +13340,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14069,7 +13365,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14084,7 +13379,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14099,6 +13393,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14120,7 +13415,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14135,7 +13429,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14150,7 +13443,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14165,7 +13457,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14180,7 +13471,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14195,7 +13485,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14210,8 +13499,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14226,8 +13513,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14249,6 +13534,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14270,7 +13556,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14295,7 +13581,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14310,7 +13595,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14325,6 +13609,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14346,7 +13631,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14361,7 +13645,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14376,7 +13659,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14391,7 +13673,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14406,7 +13687,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14421,7 +13701,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14436,8 +13715,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14452,8 +13729,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14475,6 +13750,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14496,7 +13772,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14521,7 +13797,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14536,7 +13811,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14551,6 +13825,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14572,7 +13847,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14587,7 +13861,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14602,7 +13875,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14617,7 +13889,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14632,7 +13903,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14647,7 +13917,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14662,8 +13931,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14678,8 +13945,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14701,6 +13966,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14722,7 +13988,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14747,7 +14013,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14762,7 +14027,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14777,6 +14041,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14798,7 +14063,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14813,7 +14077,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14828,7 +14091,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14843,7 +14105,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14858,7 +14119,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14873,7 +14133,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14888,8 +14147,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14904,8 +14161,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14927,6 +14182,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14948,7 +14204,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14973,7 +14229,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14988,7 +14243,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15003,6 +14257,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15024,7 +14279,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15039,7 +14293,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15054,7 +14307,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15069,7 +14321,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15084,7 +14335,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15099,7 +14349,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15114,8 +14363,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15130,8 +14377,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15153,6 +14398,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15174,7 +14420,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15199,7 +14445,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15214,7 +14459,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15229,6 +14473,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15250,7 +14495,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15265,7 +14509,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15280,7 +14523,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15295,7 +14537,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15310,7 +14551,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15325,7 +14565,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15340,8 +14579,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15356,8 +14593,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15379,6 +14614,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15400,7 +14636,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15425,7 +14661,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15440,7 +14675,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15455,6 +14689,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15476,7 +14711,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15491,7 +14725,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15506,7 +14739,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15521,7 +14753,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15536,7 +14767,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15551,7 +14781,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15566,8 +14795,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15582,8 +14809,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15605,6 +14830,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15626,7 +14852,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15651,7 +14877,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15666,7 +14891,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15681,6 +14905,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15702,7 +14927,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15717,7 +14941,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15732,7 +14955,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15747,7 +14969,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15762,7 +14983,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15777,7 +14997,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15792,8 +15011,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15808,8 +15025,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15831,6 +15046,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15852,7 +15068,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15877,7 +15093,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15892,7 +15107,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15907,6 +15121,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15928,7 +15143,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15943,7 +15157,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15958,7 +15171,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15973,7 +15185,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15988,7 +15199,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16003,7 +15213,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16018,8 +15227,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16034,8 +15241,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16057,6 +15262,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16078,7 +15284,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16103,7 +15309,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16118,7 +15323,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16133,6 +15337,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16154,7 +15359,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16169,7 +15373,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16184,7 +15387,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16199,7 +15401,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16214,7 +15415,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16229,7 +15429,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16244,8 +15443,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16260,8 +15457,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16283,6 +15478,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16307,6 +15503,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16335,7 +15532,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16352,7 +15548,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16369,6 +15564,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16393,7 +15589,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16410,7 +15605,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16427,7 +15621,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16444,7 +15637,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16461,7 +15653,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16478,7 +15669,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16495,8 +15685,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16513,8 +15701,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16540,6 +15726,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16564,6 +15751,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16592,7 +15780,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16609,7 +15796,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16626,6 +15812,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16650,7 +15837,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16667,7 +15853,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16684,7 +15869,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16701,7 +15885,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16718,7 +15901,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16735,7 +15917,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16752,8 +15933,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16770,8 +15949,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16797,6 +15974,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16821,6 +15999,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16849,7 +16028,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16866,7 +16044,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16883,6 +16060,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16907,7 +16085,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16924,7 +16101,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16941,7 +16117,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16958,7 +16133,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16975,7 +16149,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16992,7 +16165,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17009,8 +16181,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17027,8 +16197,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17054,6 +16222,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17078,6 +16247,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17106,7 +16276,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17123,7 +16292,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17140,6 +16308,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17164,7 +16333,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17181,7 +16349,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17198,7 +16365,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17215,7 +16381,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17232,7 +16397,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17249,7 +16413,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17266,8 +16429,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17284,8 +16445,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17311,6 +16470,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17335,6 +16495,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17363,7 +16524,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17380,7 +16540,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17397,6 +16556,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17421,7 +16581,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17438,7 +16597,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17455,7 +16613,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17472,7 +16629,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17489,7 +16645,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17506,7 +16661,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17523,8 +16677,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17541,8 +16693,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17568,6 +16718,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17592,6 +16743,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17620,7 +16772,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17637,7 +16788,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17654,6 +16804,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17678,7 +16829,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17695,7 +16845,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17712,7 +16861,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17729,7 +16877,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17746,7 +16893,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17763,7 +16909,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17780,8 +16925,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17798,8 +16941,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17825,6 +16966,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17849,6 +16991,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17877,7 +17020,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17894,7 +17036,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17911,6 +17052,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17935,7 +17077,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17952,7 +17093,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17969,7 +17109,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17986,7 +17125,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18003,7 +17141,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18020,7 +17157,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18037,8 +17173,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18055,8 +17189,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18082,6 +17214,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18106,6 +17239,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18134,7 +17268,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18151,7 +17284,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18168,6 +17300,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18192,7 +17325,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18209,7 +17341,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18226,7 +17357,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18243,7 +17373,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18260,7 +17389,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18277,7 +17405,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18294,8 +17421,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18312,8 +17437,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18339,6 +17462,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18363,6 +17487,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18391,7 +17516,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18408,7 +17532,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18425,6 +17548,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18449,7 +17573,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18466,7 +17589,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18483,7 +17605,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18500,7 +17621,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18517,7 +17637,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18534,7 +17653,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18551,8 +17669,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18569,8 +17685,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18596,6 +17710,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18620,6 +17735,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18648,7 +17764,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18665,7 +17780,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18682,6 +17796,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18706,7 +17821,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18723,7 +17837,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18740,7 +17853,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18757,7 +17869,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18774,7 +17885,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18791,7 +17901,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18808,8 +17917,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18826,8 +17933,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18853,6 +17958,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18877,6 +17983,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18905,7 +18012,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18922,7 +18028,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18939,6 +18044,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18963,7 +18069,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18980,7 +18085,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18997,7 +18101,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19014,7 +18117,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19031,7 +18133,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19048,7 +18149,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19065,8 +18165,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19083,8 +18181,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19110,6 +18206,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19134,6 +18231,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19162,7 +18260,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19179,7 +18276,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19196,6 +18292,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19220,7 +18317,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19237,7 +18333,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19254,7 +18349,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19271,7 +18365,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19288,7 +18381,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19305,7 +18397,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19322,8 +18413,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19340,8 +18429,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19367,6 +18454,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19391,6 +18479,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19419,7 +18508,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19436,7 +18524,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19453,6 +18540,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19477,7 +18565,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19494,7 +18581,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19511,7 +18597,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19528,7 +18613,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19545,7 +18629,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19562,7 +18645,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19579,8 +18661,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19597,8 +18677,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19624,6 +18702,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19648,6 +18727,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19676,7 +18756,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19693,7 +18772,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19710,6 +18788,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19734,7 +18813,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19751,7 +18829,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19768,7 +18845,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19785,7 +18861,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19802,7 +18877,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19819,7 +18893,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19836,8 +18909,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19854,8 +18925,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19881,6 +18950,7 @@ entry: define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19905,6 +18975,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19933,7 +19004,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19950,7 +19020,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19967,6 +19036,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19991,7 +19061,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20008,7 +19077,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20025,7 +19093,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20042,7 +19109,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20059,7 +19125,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20076,7 +19141,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20093,8 +19157,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20111,8 +19173,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 4a5d215bcede68..0edf543f33f480 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -16,8 +16,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -40,9 +41,8 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX7-LABEL: global_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -55,11 +55,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: global_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -68,11 +66,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX10-CU-LABEL: global_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -81,8 +77,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -105,11 +102,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -118,11 +113,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -131,11 +124,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -144,11 +135,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -157,10 +146,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX11-WGP-LABEL: global_workgroup_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -169,10 +157,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX11-CU-LABEL: global_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -181,11 +168,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX12-WGP-LABEL: global_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -194,11 +179,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX12-CU-LABEL: global_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -214,8 +197,9 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX6-LABEL: global_workgroup_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -238,9 +222,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX7-LABEL: global_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -253,11 +236,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -266,11 +247,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -279,8 +258,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -303,11 +283,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -316,11 +294,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -329,11 +305,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -342,11 +316,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -355,10 +327,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -367,10 +338,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -379,11 +349,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -392,11 +360,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -412,8 +378,9 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX6-LABEL: global_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -436,9 +403,8 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX7-LABEL: global_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -451,11 +417,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -465,11 +429,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX10-CU-LABEL: global_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -478,8 +440,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -502,11 +465,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -515,11 +476,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -529,11 +488,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -542,11 +499,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -556,10 +511,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -569,10 +523,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX11-CU-LABEL: global_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -581,11 +534,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -595,11 +546,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX12-CU-LABEL: global_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -615,8 +564,9 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX6-LABEL: global_workgroup_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -640,9 +590,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -656,11 +605,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -671,11 +618,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -684,8 +629,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -709,11 +655,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -722,11 +666,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -736,11 +678,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -749,11 +689,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -763,10 +701,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -777,10 +714,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -789,11 +725,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 @@ -809,11 +743,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] @@ -830,8 +762,9 @@ entry: define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX6-LABEL: global_workgroup_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -847,8 +780,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX7-LABEL: global_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -858,10 +791,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: global_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -870,10 +801,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX10-CU-LABEL: global_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -882,8 +811,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -899,10 +829,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -911,10 +839,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -923,10 +849,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -935,10 +859,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -947,9 +869,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX11-WGP-LABEL: global_workgroup_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -958,9 +879,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX11-CU-LABEL: global_workgroup_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -969,10 +889,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX12-WGP-LABEL: global_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -981,10 +899,8 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX12-CU-LABEL: global_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -999,8 +915,9 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX6-LABEL: global_workgroup_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1016,8 +933,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX7-LABEL: global_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1027,10 +944,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1039,10 +954,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1051,8 +964,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1068,10 +982,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1080,10 +992,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1092,10 +1002,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1104,10 +1012,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1116,9 +1022,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1127,9 +1032,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1138,10 +1042,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1150,10 +1052,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1168,8 +1068,9 @@ entry: define amdgpu_kernel void @global_workgroup_release_store( ; GFX6-LABEL: global_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1186,8 +1087,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX7-LABEL: global_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1198,10 +1099,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX10-WGP-LABEL: global_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1212,10 +1111,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX10-CU-LABEL: global_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1225,8 +1122,9 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1243,10 +1141,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1256,10 +1152,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1269,10 +1163,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1282,10 +1174,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1295,9 +1185,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX11-WGP-LABEL: global_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1308,9 +1197,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX11-CU-LABEL: global_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1320,10 +1208,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX12-WGP-LABEL: global_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1336,10 +1222,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX12-CU-LABEL: global_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1355,8 +1239,9 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX6-LABEL: global_workgroup_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1373,8 +1258,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1385,10 +1270,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -1399,10 +1282,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -1412,8 +1293,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1430,10 +1312,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1443,10 +1323,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1456,10 +1334,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1469,10 +1345,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1482,9 +1356,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1495,9 +1368,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1507,10 +1379,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1523,10 +1393,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1542,8 +1410,8 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX6-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1559,9 +1427,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1572,9 +1439,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1583,9 +1449,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1593,8 +1458,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1611,9 +1476,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1622,9 +1486,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1633,9 +1496,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1644,9 +1506,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1655,8 +1516,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1665,8 +1526,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1675,8 +1536,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -1685,8 +1546,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1700,8 +1561,8 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX6-LABEL: global_workgroup_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1717,9 +1578,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1730,9 +1590,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1743,9 +1602,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1753,8 +1611,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1771,9 +1629,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1782,9 +1639,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -1795,9 +1651,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1806,9 +1661,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -1819,8 +1673,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1831,8 +1685,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1841,8 +1695,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -1853,8 +1707,8 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1868,8 +1722,8 @@ entry: define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX6-LABEL: global_workgroup_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -1886,9 +1740,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -1900,9 +1753,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1913,9 +1765,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1924,8 +1775,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1943,9 +1794,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1955,9 +1805,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1967,9 +1816,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1979,9 +1827,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1991,8 +1838,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2003,8 +1850,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2014,8 +1861,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2028,8 +1875,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -2044,8 +1891,8 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX6-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2062,9 +1909,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2076,9 +1922,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2091,9 +1936,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2102,8 +1946,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2121,9 +1965,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2133,9 +1976,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2147,9 +1989,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2159,9 +2000,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2173,8 +2013,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2187,8 +2027,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2198,8 +2038,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2214,8 +2054,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -2230,8 +2070,8 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX6-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2248,9 +2088,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -2262,9 +2101,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2277,9 +2115,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2288,8 +2125,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2307,9 +2144,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2319,9 +2155,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2333,9 +2168,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2345,9 +2179,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2359,8 +2192,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2373,8 +2206,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2384,8 +2217,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2400,8 +2233,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -2416,8 +2249,8 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2435,8 +2268,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2451,9 +2284,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2465,9 +2297,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2477,8 +2308,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2497,9 +2328,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2510,9 +2340,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -2524,9 +2353,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2537,9 +2365,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -2551,8 +2378,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2564,8 +2391,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2576,8 +2403,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -2589,8 +2416,8 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -2607,8 +2434,8 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2627,8 +2454,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2644,9 +2471,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2660,9 +2486,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2673,8 +2498,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2694,9 +2519,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2708,9 +2532,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2723,9 +2546,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2737,9 +2559,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2752,8 +2573,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2767,8 +2588,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2780,8 +2601,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -2799,8 +2620,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -2818,8 +2639,8 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -2838,8 +2659,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -2855,9 +2676,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2871,9 +2691,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2884,8 +2703,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -2905,9 +2724,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2919,9 +2737,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2934,9 +2751,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2948,9 +2764,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2963,8 +2778,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2978,8 +2793,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2991,8 +2806,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -3010,8 +2825,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -3029,6 +2844,7 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3050,7 +2866,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3075,7 +2891,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3090,7 +2905,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3105,6 +2919,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3126,7 +2941,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3141,7 +2955,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3156,7 +2969,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3171,7 +2983,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3186,7 +2997,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3201,7 +3011,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3216,8 +3025,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3232,8 +3039,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3255,6 +3060,7 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3276,7 +3082,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3301,7 +3107,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3318,7 +3123,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3333,6 +3137,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3354,7 +3159,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3369,7 +3173,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3386,7 +3189,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3401,7 +3203,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3418,7 +3219,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3435,7 +3235,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3450,8 +3249,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3468,8 +3265,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3491,6 +3286,7 @@ entry: define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3513,7 +3309,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3539,7 +3335,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3556,7 +3351,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3572,6 +3366,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3594,7 +3389,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3610,7 +3404,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3626,7 +3419,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3642,7 +3434,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3658,7 +3449,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3675,7 +3465,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3691,8 +3480,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3711,8 +3498,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3735,6 +3520,7 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -3757,7 +3543,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -3783,7 +3569,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3802,7 +3587,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3818,6 +3602,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -3840,7 +3625,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3856,7 +3640,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -3874,7 +3657,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3890,7 +3672,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -3908,7 +3689,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3927,7 +3707,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3943,8 +3722,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3965,8 +3742,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3989,6 +3764,7 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4011,7 +3787,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4037,7 +3813,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4056,7 +3831,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4072,6 +3846,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4094,7 +3869,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4110,7 +3884,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4128,7 +3901,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4144,7 +3916,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4162,7 +3933,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4181,7 +3951,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4197,8 +3966,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4219,8 +3986,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4243,6 +4008,7 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4264,7 +4030,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4289,7 +4055,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4306,7 +4071,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4321,6 +4085,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4342,7 +4107,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4357,7 +4121,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4374,7 +4137,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4389,7 +4151,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4406,7 +4167,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4423,7 +4183,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4438,8 +4197,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4456,8 +4213,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4479,6 +4234,7 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4500,7 +4256,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4525,7 +4281,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4542,7 +4297,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4557,6 +4311,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4578,7 +4333,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4593,7 +4347,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4610,7 +4363,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4625,7 +4377,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4642,7 +4393,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4659,7 +4409,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4674,8 +4423,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4692,8 +4439,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4715,6 +4460,7 @@ entry: define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4737,7 +4483,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -4763,7 +4509,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4782,7 +4527,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4798,6 +4542,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -4820,7 +4565,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4836,7 +4580,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -4854,7 +4597,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4870,7 +4612,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -4888,7 +4629,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4907,7 +4647,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4923,8 +4662,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4945,8 +4682,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4969,6 +4704,7 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -4991,7 +4727,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5017,7 +4753,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5036,7 +4771,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5052,6 +4786,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5074,7 +4809,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5090,7 +4824,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5108,7 +4841,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5124,7 +4856,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5142,7 +4873,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5161,7 +4891,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5177,8 +4906,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5199,8 +4926,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5223,6 +4948,7 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5245,7 +4971,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5271,7 +4997,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5290,7 +5015,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5306,6 +5030,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5328,7 +5053,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5344,7 +5068,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5362,7 +5085,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5378,7 +5100,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5396,7 +5117,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5415,7 +5135,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5431,8 +5150,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5453,8 +5170,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5477,6 +5192,7 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5499,7 +5215,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5525,7 +5241,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5544,7 +5259,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5560,6 +5274,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5582,7 +5297,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5598,7 +5312,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5616,7 +5329,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5632,7 +5344,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5650,7 +5361,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5669,7 +5379,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5685,8 +5394,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5707,8 +5414,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5731,6 +5436,7 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -5753,7 +5459,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -5779,7 +5485,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5798,7 +5503,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5814,6 +5518,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -5836,7 +5541,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5852,7 +5556,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -5870,7 +5573,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5886,7 +5588,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -5904,7 +5605,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5923,7 +5623,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5939,8 +5638,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5961,8 +5658,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5985,6 +5680,7 @@ entry: define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6007,7 +5703,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6033,7 +5729,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6052,7 +5747,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6068,6 +5762,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6090,7 +5785,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6106,7 +5800,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6124,7 +5817,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6140,7 +5832,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6158,7 +5849,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6177,7 +5867,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6193,8 +5882,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6215,8 +5902,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6239,6 +5924,7 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6261,7 +5947,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6287,7 +5973,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6306,7 +5991,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6322,6 +6006,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6344,7 +6029,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6360,7 +6044,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6378,7 +6061,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6394,7 +6076,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6412,7 +6093,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6431,7 +6111,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6447,8 +6126,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6469,8 +6146,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6493,6 +6168,7 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6515,7 +6191,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -6541,7 +6217,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6560,7 +6235,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6576,6 +6250,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6598,7 +6273,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6614,7 +6288,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6632,7 +6305,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6648,7 +6320,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6666,7 +6337,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6685,7 +6355,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6701,8 +6370,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6723,8 +6390,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6747,6 +6412,7 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6771,6 +6437,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -6799,7 +6466,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6816,7 +6482,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6833,6 +6498,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -6857,7 +6523,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6874,7 +6539,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -6891,7 +6555,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6908,7 +6571,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -6925,7 +6587,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6942,7 +6603,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6959,8 +6619,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6977,8 +6635,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7004,6 +6660,7 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7028,6 +6685,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7056,7 +6714,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7074,7 +6731,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7091,6 +6747,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7115,7 +6772,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7132,7 +6788,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7150,7 +6805,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7167,7 +6821,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7185,7 +6838,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7203,7 +6855,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7220,8 +6871,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7239,8 +6888,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7266,6 +6913,7 @@ entry: define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7291,6 +6939,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7320,7 +6969,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7339,7 +6987,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7357,6 +7004,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7382,7 +7030,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7400,7 +7047,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7418,7 +7064,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7436,7 +7081,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7454,7 +7098,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7473,7 +7116,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7491,8 +7133,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7513,8 +7153,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7541,6 +7179,7 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7566,6 +7205,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7595,7 +7235,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7615,7 +7254,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7633,6 +7271,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7658,7 +7297,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7676,7 +7314,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7695,7 +7332,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7713,7 +7349,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7732,7 +7367,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7752,7 +7386,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7770,8 +7403,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7795,8 +7426,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7823,6 +7452,7 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7848,6 +7478,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -7877,7 +7508,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7897,7 +7527,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7915,6 +7544,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -7940,7 +7570,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7958,7 +7587,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -7977,7 +7605,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -7995,7 +7622,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8014,7 +7640,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8034,7 +7659,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8052,8 +7676,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8077,8 +7699,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8105,6 +7725,7 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8129,6 +7750,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8157,7 +7779,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8175,7 +7796,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8192,6 +7812,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8216,7 +7837,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8233,7 +7853,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8251,7 +7870,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8268,7 +7886,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8286,7 +7903,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8304,7 +7920,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8321,8 +7936,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8342,8 +7955,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8369,6 +7980,7 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8393,6 +8005,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8421,7 +8034,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8439,7 +8051,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8456,6 +8067,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8480,7 +8092,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8497,7 +8108,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8515,7 +8125,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8532,7 +8141,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8550,7 +8158,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8568,7 +8175,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8585,8 +8191,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8604,8 +8208,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8631,6 +8233,7 @@ entry: define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8656,6 +8259,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8685,7 +8289,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8705,7 +8308,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8723,6 +8325,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -8748,7 +8351,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8766,7 +8368,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8785,7 +8386,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8803,7 +8403,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -8822,7 +8421,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8842,7 +8440,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8860,8 +8457,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8885,8 +8480,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8913,6 +8506,7 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8938,6 +8532,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -8967,7 +8562,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -8987,7 +8581,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9005,6 +8598,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9030,7 +8624,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9048,7 +8641,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9067,7 +8659,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9085,7 +8676,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9104,7 +8694,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9124,7 +8713,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9142,8 +8730,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9167,8 +8753,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9195,6 +8779,7 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9220,6 +8805,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9249,7 +8835,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9269,7 +8854,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9287,6 +8871,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9312,7 +8897,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9330,7 +8914,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9349,7 +8932,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9367,7 +8949,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9386,7 +8967,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9406,7 +8986,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9424,8 +9003,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9449,8 +9026,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9477,6 +9052,7 @@ entry: define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9502,6 +9078,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9531,7 +9108,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9551,7 +9127,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9569,6 +9144,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9594,7 +9170,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9612,7 +9187,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9631,7 +9205,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9649,7 +9222,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9668,7 +9240,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9688,7 +9259,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9706,8 +9276,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9731,8 +9299,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9759,6 +9325,7 @@ entry: define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9784,6 +9351,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -9813,7 +9381,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9833,7 +9400,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9851,6 +9417,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -9876,7 +9443,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9894,7 +9460,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -9913,7 +9478,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9931,7 +9495,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -9950,7 +9513,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9970,7 +9532,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9988,8 +9549,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10011,8 +9570,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10039,6 +9596,7 @@ entry: define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10064,6 +9622,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10093,7 +9652,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10113,7 +9671,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10131,6 +9688,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10156,7 +9714,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10174,7 +9731,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10193,7 +9749,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10211,7 +9766,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10230,7 +9784,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10250,7 +9803,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10268,8 +9820,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10293,8 +9843,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10321,6 +9869,7 @@ entry: define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10346,6 +9895,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10375,7 +9925,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10395,7 +9944,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10413,6 +9961,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10438,7 +9987,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10456,7 +10004,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10475,7 +10022,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10493,7 +10039,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10512,7 +10057,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10532,7 +10076,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10550,8 +10093,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10575,8 +10116,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10603,6 +10142,7 @@ entry: define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10628,6 +10168,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -10657,7 +10198,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10677,7 +10217,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10695,6 +10234,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -10720,7 +10260,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10738,7 +10277,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -10757,7 +10295,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10775,7 +10312,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -10794,7 +10330,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10814,7 +10349,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10832,8 +10366,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10857,8 +10389,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10885,8 +10415,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX6-LABEL: global_workgroup_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -10909,9 +10440,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -10924,11 +10454,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -10937,11 +10465,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -10950,8 +10476,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -10974,11 +10501,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10987,11 +10512,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11000,11 +10523,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11013,11 +10534,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11026,10 +10545,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11038,10 +10556,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX11-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -11050,11 +10567,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11063,11 +10578,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11083,8 +10596,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX6-LABEL: global_workgroup_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -11107,9 +10621,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11122,11 +10635,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11135,11 +10646,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -11148,8 +10657,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11172,11 +10682,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11185,11 +10693,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11198,11 +10704,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11211,11 +10715,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11224,10 +10726,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11236,10 +10737,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -11248,11 +10748,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11261,11 +10759,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11281,8 +10777,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX6-LABEL: global_workgroup_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -11305,9 +10802,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11320,11 +10816,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11334,11 +10828,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -11347,8 +10839,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11371,11 +10864,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11384,11 +10875,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11398,11 +10887,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11411,11 +10898,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11425,10 +10910,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -11438,10 +10922,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -11450,11 +10933,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11464,11 +10945,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11484,8 +10963,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, s9 ; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 @@ -11508,9 +10988,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11523,11 +11002,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc @@ -11538,11 +11015,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -11551,8 +11026,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11575,11 +11051,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11588,11 +11062,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11602,11 +11074,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11615,11 +11085,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11629,10 +11097,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc @@ -11643,10 +11110,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -11655,11 +11121,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 @@ -11675,11 +11139,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -11695,8 +11157,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX6-LABEL: global_workgroup_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11712,8 +11175,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11723,10 +11186,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11735,10 +11196,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11747,8 +11206,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11764,10 +11224,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11776,10 +11234,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11788,10 +11244,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11800,10 +11254,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11812,9 +11264,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11823,9 +11274,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX11-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11834,10 +11284,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11846,10 +11294,8 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX12-CU-LABEL: global_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -11864,8 +11310,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX6-LABEL: global_workgroup_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -11881,8 +11328,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -11892,10 +11339,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -11904,10 +11349,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -11916,8 +11359,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -11933,10 +11377,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11945,10 +11387,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -11957,10 +11397,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11969,10 +11407,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -11981,9 +11417,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -11992,9 +11427,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12003,10 +11437,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12015,10 +11447,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12033,8 +11463,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX6-LABEL: global_workgroup_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12050,8 +11481,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX7-LABEL: global_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12061,10 +11492,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -12075,10 +11504,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -12087,8 +11514,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12104,10 +11532,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12116,10 +11542,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12129,10 +11553,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12141,10 +11563,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12154,9 +11574,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12167,9 +11586,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12178,10 +11596,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12194,10 +11610,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12212,8 +11626,9 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12229,8 +11644,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12240,10 +11655,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -12254,10 +11667,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -12266,8 +11677,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12283,10 +11695,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12295,10 +11705,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -12308,10 +11716,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12320,10 +11726,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -12333,9 +11737,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12346,9 +11749,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12357,10 +11759,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -12373,10 +11773,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -12391,8 +11789,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12408,9 +11806,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12421,9 +11818,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12432,9 +11828,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12442,8 +11837,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12460,9 +11855,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12471,9 +11865,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12482,9 +11875,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12493,9 +11885,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12504,8 +11895,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12514,8 +11905,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12524,8 +11915,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -12534,8 +11925,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12549,8 +11940,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12566,9 +11957,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12579,9 +11969,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12592,9 +11981,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12602,8 +11990,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12620,9 +12008,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12631,9 +12018,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12644,9 +12030,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12655,9 +12040,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12668,8 +12052,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12680,8 +12064,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12690,8 +12074,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -12702,8 +12086,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12717,8 +12101,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12734,9 +12118,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12747,9 +12130,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12760,9 +12142,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12770,8 +12151,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12788,9 +12169,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12799,9 +12179,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12811,9 +12190,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12822,9 +12200,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12834,8 +12211,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12846,8 +12223,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12856,8 +12233,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -12870,8 +12247,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -12885,8 +12262,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -12902,9 +12279,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -12915,9 +12291,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -12930,9 +12305,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12940,8 +12314,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -12958,9 +12332,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -12969,9 +12342,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12983,9 +12355,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -12994,9 +12365,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13008,8 +12378,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13022,8 +12392,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13032,8 +12402,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13048,8 +12418,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13063,8 +12433,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13080,9 +12450,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 @@ -13093,9 +12462,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13108,9 +12476,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13118,8 +12485,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13136,9 +12503,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] @@ -13147,9 +12513,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13161,9 +12526,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] @@ -13172,9 +12536,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13186,8 +12549,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13200,8 +12563,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13210,8 +12573,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13226,8 +12589,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -13241,8 +12604,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13260,8 +12623,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -13276,9 +12639,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13290,9 +12652,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13302,8 +12663,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13322,9 +12683,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13335,9 +12695,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13349,9 +12708,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -13362,9 +12720,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -13376,8 +12733,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -13389,8 +12746,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -13401,8 +12758,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE @@ -13414,8 +12771,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -13432,8 +12789,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13451,8 +12808,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -13467,9 +12824,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13483,9 +12839,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13495,8 +12850,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13515,9 +12870,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13528,9 +12882,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13543,9 +12896,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -13556,9 +12908,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13571,8 +12922,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13586,8 +12937,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -13598,8 +12949,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13617,8 +12968,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -13635,8 +12986,8 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -13654,8 +13005,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -13670,9 +13021,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13686,9 +13036,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13698,8 +13047,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -13718,9 +13067,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc @@ -13731,9 +13079,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13746,9 +13093,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 @@ -13759,9 +13105,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13774,8 +13119,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -13789,8 +13134,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -13801,8 +13146,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -13820,8 +13165,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN @@ -13838,6 +13183,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -13859,7 +13205,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -13884,7 +13230,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13899,7 +13244,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13914,6 +13258,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -13935,7 +13280,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13950,7 +13294,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -13965,7 +13308,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13980,7 +13322,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -13995,7 +13336,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14010,7 +13350,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14025,8 +13364,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14041,8 +13378,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14064,6 +13399,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14085,7 +13421,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14110,7 +13446,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14127,7 +13462,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14142,6 +13476,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14163,7 +13498,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14178,7 +13512,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14195,7 +13528,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14210,7 +13542,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14227,7 +13558,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14244,7 +13574,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14259,8 +13588,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14277,8 +13604,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14300,6 +13625,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14321,7 +13647,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14346,7 +13672,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14363,7 +13688,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14378,6 +13702,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14399,7 +13724,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14414,7 +13738,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14430,7 +13753,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14445,7 +13767,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14461,7 +13782,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14478,7 +13798,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14493,8 +13812,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14513,8 +13830,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14536,6 +13851,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14557,7 +13873,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14582,7 +13898,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14601,7 +13916,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14616,6 +13930,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14637,7 +13952,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14652,7 +13966,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14670,7 +13983,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14685,7 +13997,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14703,7 +14014,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14722,7 +14032,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14737,8 +14046,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14759,8 +14066,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14782,6 +14087,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -14803,7 +14109,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -14828,7 +14134,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14847,7 +14152,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14862,6 +14166,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -14883,7 +14188,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14898,7 +14202,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -14916,7 +14219,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14931,7 +14233,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -14949,7 +14250,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14968,7 +14268,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14983,8 +14282,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15005,8 +14302,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15028,6 +14323,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15049,7 +14345,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15074,7 +14370,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15091,7 +14386,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15106,6 +14400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15127,7 +14422,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15142,7 +14436,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15159,7 +14452,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15174,7 +14466,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15191,7 +14482,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15208,7 +14498,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15223,8 +14512,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15241,8 +14528,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15264,6 +14549,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15285,7 +14571,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15310,7 +14596,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15327,7 +14612,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15342,6 +14626,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15363,7 +14648,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15378,7 +14662,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15395,7 +14678,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15410,7 +14692,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15427,7 +14708,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15444,7 +14724,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15459,8 +14738,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15477,8 +14754,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15500,6 +14775,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15521,7 +14797,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15546,7 +14822,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15565,7 +14840,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15580,6 +14854,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15601,7 +14876,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15616,7 +14890,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15634,7 +14907,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15649,7 +14921,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15667,7 +14938,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15686,7 +14956,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15701,8 +14970,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15723,8 +14990,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15746,6 +15011,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -15767,7 +15033,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -15792,7 +15058,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15811,7 +15076,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15826,6 +15090,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -15847,7 +15112,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15862,7 +15126,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -15880,7 +15143,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15895,7 +15157,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -15913,7 +15174,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15932,7 +15192,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15947,8 +15206,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15969,8 +15226,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15992,6 +15247,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16013,7 +15269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16038,7 +15294,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16057,7 +15312,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16072,6 +15326,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16093,7 +15348,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16108,7 +15362,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16126,7 +15379,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16141,7 +15393,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16159,7 +15410,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16178,7 +15428,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16193,8 +15442,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16215,8 +15462,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16238,6 +15483,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16259,7 +15505,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16284,7 +15530,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16303,7 +15548,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16318,6 +15562,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16339,7 +15584,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16354,7 +15598,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16372,7 +15615,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16387,7 +15629,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16405,7 +15646,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16424,7 +15664,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16439,8 +15678,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16461,8 +15698,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16484,6 +15719,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16505,7 +15741,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16530,7 +15766,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16549,7 +15784,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16564,6 +15798,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16585,7 +15820,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16600,7 +15834,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16618,7 +15851,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16633,7 +15865,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16651,7 +15882,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16670,7 +15900,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16685,8 +15914,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16707,8 +15934,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16730,6 +15955,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16751,7 +15977,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -16776,7 +16002,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16795,7 +16020,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16810,6 +16034,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -16831,7 +16056,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16846,7 +16070,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -16864,7 +16087,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16879,7 +16101,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -16897,7 +16118,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16916,7 +16136,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16931,8 +16150,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16953,8 +16170,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16976,6 +16191,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -16997,7 +16213,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17022,7 +16238,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17041,7 +16256,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17056,6 +16270,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17077,7 +16292,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17092,7 +16306,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17110,7 +16323,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17125,7 +16337,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17143,7 +16354,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17162,7 +16372,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17177,8 +16386,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17199,8 +16406,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17222,6 +16427,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17243,7 +16449,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 @@ -17268,7 +16474,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17287,7 +16492,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17302,6 +16506,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17323,7 +16528,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17338,7 +16542,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17356,7 +16559,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17371,7 +16573,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17389,7 +16590,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17408,7 +16608,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17423,8 +16622,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17445,8 +16642,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17468,6 +16663,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17492,6 +16688,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17520,7 +16717,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17537,7 +16733,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17554,6 +16749,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17578,7 +16774,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17595,7 +16790,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17612,7 +16806,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17629,7 +16822,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17646,7 +16838,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17663,7 +16854,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17680,8 +16870,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17698,8 +16886,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17725,6 +16911,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17749,6 +16936,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -17777,7 +16965,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17795,7 +16982,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17812,6 +16998,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -17836,7 +17023,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17853,7 +17039,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -17871,7 +17056,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17888,7 +17072,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -17906,7 +17089,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17924,7 +17106,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17941,8 +17122,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17960,8 +17139,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17987,6 +17164,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18011,6 +17189,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18039,7 +17218,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18058,7 +17236,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18075,6 +17252,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18099,7 +17277,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18116,7 +17293,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18134,7 +17310,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18151,7 +17326,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18169,7 +17343,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18188,7 +17361,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18205,8 +17377,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18227,8 +17397,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18254,6 +17422,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18278,6 +17447,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18306,7 +17476,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18326,7 +17495,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18343,6 +17511,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18367,7 +17536,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18384,7 +17552,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18403,7 +17570,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18420,7 +17586,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18439,7 +17604,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18459,7 +17623,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18476,8 +17639,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18501,8 +17662,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18528,6 +17687,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18552,6 +17712,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18580,7 +17741,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18600,7 +17760,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18617,6 +17776,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18641,7 +17801,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18658,7 +17817,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18677,7 +17835,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18694,7 +17851,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18713,7 +17869,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18733,7 +17888,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18750,8 +17904,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18775,8 +17927,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18802,6 +17952,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18826,6 +17977,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -18854,7 +18006,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18872,7 +18023,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18889,6 +18039,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -18913,7 +18064,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18930,7 +18080,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -18948,7 +18097,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18965,7 +18113,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -18983,7 +18130,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19001,7 +18147,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19018,8 +18163,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19039,8 +18182,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19066,6 +18207,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19090,6 +18232,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19118,7 +18261,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19136,7 +18278,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19153,6 +18294,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19177,7 +18319,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19194,7 +18335,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19212,7 +18352,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19229,7 +18368,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19247,7 +18385,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19265,7 +18402,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19282,8 +18418,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19301,8 +18435,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19328,6 +18460,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19352,6 +18485,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19380,7 +18514,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19400,7 +18533,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19417,6 +18549,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19441,7 +18574,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19458,7 +18590,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19477,7 +18608,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19494,7 +18624,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19513,7 +18642,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19533,7 +18661,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19550,8 +18677,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19575,8 +18700,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19602,6 +18725,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19626,6 +18750,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19654,7 +18779,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19674,7 +18798,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19691,6 +18814,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19715,7 +18839,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19732,7 +18855,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19751,7 +18873,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19768,7 +18889,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -19787,7 +18907,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19807,7 +18926,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19824,8 +18942,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19849,8 +18965,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19876,6 +18990,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19900,6 +19015,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -19928,7 +19044,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19948,7 +19063,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -19965,6 +19079,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -19989,7 +19104,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20006,7 +19120,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20025,7 +19138,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20042,7 +19154,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20061,7 +19172,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20081,7 +19191,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20098,8 +19207,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20123,8 +19230,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20150,6 +19255,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20174,6 +19280,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20202,7 +19309,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20222,7 +19328,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20239,6 +19344,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20263,7 +19369,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20280,7 +19385,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20299,7 +19403,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20316,7 +19419,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20335,7 +19437,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20355,7 +19456,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20372,8 +19472,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20397,8 +19495,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20424,6 +19520,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20448,6 +19545,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20476,7 +19574,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20496,7 +19593,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20513,6 +19609,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20537,7 +19634,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20554,7 +19650,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20573,7 +19668,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20590,7 +19684,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20609,7 +19702,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20629,7 +19721,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20646,8 +19737,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20669,8 +19758,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20696,6 +19783,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20720,6 +19808,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20748,7 +19837,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20768,7 +19856,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20785,6 +19872,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -20809,7 +19897,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20826,7 +19913,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -20845,7 +19931,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20862,7 +19947,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -20881,7 +19965,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20901,7 +19984,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20918,8 +20000,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20943,8 +20023,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20970,6 +20048,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -20994,6 +20073,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21022,7 +20102,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21042,7 +20121,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21059,6 +20137,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21083,7 +20162,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21100,7 +20178,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21119,7 +20196,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21136,7 +20212,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21155,7 +20230,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21175,7 +20249,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21192,8 +20265,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21217,8 +20288,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21244,6 +20313,7 @@ entry: define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21268,6 +20338,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 @@ -21296,7 +20367,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21316,7 +20386,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21333,6 +20402,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 @@ -21357,7 +20427,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21374,7 +20443,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 @@ -21393,7 +20461,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21410,7 +20477,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 @@ -21429,7 +20495,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21449,7 +20514,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21466,8 +20530,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21491,8 +20553,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index b4a95d23788a9a..38b1e5407d3e64 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -32,8 +32,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX7-LABEL: local_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -46,8 +46,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX10-WGP-LABEL: local_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -58,8 +58,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX10-CU-LABEL: local_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -84,8 +84,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -96,8 +96,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -108,8 +108,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -120,8 +120,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX11-WGP-LABEL: local_agent_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX11-CU-LABEL: local_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX12-WGP-LABEL: local_agent_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -168,8 +168,8 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX12-CU-LABEL: local_agent_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @local_agent_monotonic_load( ; GFX6-LABEL: local_agent_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -203,8 +203,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX7-LABEL: local_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -217,8 +217,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX10-WGP-LABEL: local_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -229,8 +229,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX10-CU-LABEL: local_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -241,8 +241,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -255,8 +255,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -267,8 +267,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -279,8 +279,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -291,8 +291,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -303,8 +303,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX11-WGP-LABEL: local_agent_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -315,8 +315,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX11-CU-LABEL: local_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -327,8 +327,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX12-WGP-LABEL: local_agent_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -339,8 +339,8 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX12-CU-LABEL: local_agent_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -358,8 +358,8 @@ entry: define amdgpu_kernel void @local_agent_acquire_load( ; GFX6-LABEL: local_agent_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -374,8 +374,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX7-LABEL: local_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -388,8 +388,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX10-WGP-LABEL: local_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX10-CU-LABEL: local_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -413,8 +413,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -427,8 +427,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -439,8 +439,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -452,8 +452,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -464,8 +464,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -477,8 +477,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX11-WGP-LABEL: local_agent_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -490,8 +490,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX11-CU-LABEL: local_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -502,8 +502,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX12-WGP-LABEL: local_agent_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -515,8 +515,8 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX12-CU-LABEL: local_agent_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -534,8 +534,8 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX6-LABEL: local_agent_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -551,8 +551,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX7-LABEL: local_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -566,8 +566,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -581,8 +581,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX10-CU-LABEL: local_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +594,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -609,8 +609,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -622,8 +622,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -636,8 +636,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -649,8 +649,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -663,8 +663,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -678,8 +678,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX11-CU-LABEL: local_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -691,8 +691,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -708,8 +708,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX12-CU-LABEL: local_agent_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -728,9 +728,9 @@ entry: define amdgpu_kernel void @local_agent_unordered_store( ; GFX6-LABEL: local_agent_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -740,8 +740,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX7-LABEL: local_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -751,8 +751,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX10-WGP-LABEL: local_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -761,8 +761,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX10-CU-LABEL: local_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -771,8 +771,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -782,8 +782,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -792,8 +792,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -802,8 +802,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -812,8 +812,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -822,8 +822,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX11-WGP-LABEL: local_agent_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -832,8 +832,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX11-CU-LABEL: local_agent_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -842,8 +842,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX12-WGP-LABEL: local_agent_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -852,8 +852,8 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX12-CU-LABEL: local_agent_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -868,9 +868,9 @@ entry: define amdgpu_kernel void @local_agent_monotonic_store( ; GFX6-LABEL: local_agent_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -880,8 +880,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX7-LABEL: local_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -891,8 +891,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX10-WGP-LABEL: local_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -901,8 +901,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX10-CU-LABEL: local_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -911,8 +911,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -922,8 +922,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -932,8 +932,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -942,8 +942,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -952,8 +952,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -962,8 +962,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX11-WGP-LABEL: local_agent_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -972,8 +972,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX11-CU-LABEL: local_agent_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -982,8 +982,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX12-WGP-LABEL: local_agent_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -992,8 +992,8 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX12-CU-LABEL: local_agent_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1008,9 +1008,9 @@ entry: define amdgpu_kernel void @local_agent_release_store( ; GFX6-LABEL: local_agent_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1021,8 +1021,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX7-LABEL: local_agent_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1033,8 +1033,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX10-WGP-LABEL: local_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1045,8 +1045,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX10-CU-LABEL: local_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1056,8 +1056,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1079,8 +1079,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1090,8 +1090,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1112,8 +1112,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX11-WGP-LABEL: local_agent_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1124,8 +1124,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX11-CU-LABEL: local_agent_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1135,8 +1135,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX12-WGP-LABEL: local_agent_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1149,8 +1149,8 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX12-CU-LABEL: local_agent_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1166,9 +1166,9 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX6-LABEL: local_agent_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1179,8 +1179,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX7-LABEL: local_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1191,8 +1191,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1203,8 +1203,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX10-CU-LABEL: local_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1214,8 +1214,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1226,8 +1226,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1237,8 +1237,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1248,8 +1248,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1259,8 +1259,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1270,8 +1270,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1282,8 +1282,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX11-CU-LABEL: local_agent_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1293,8 +1293,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1307,8 +1307,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX12-CU-LABEL: local_agent_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1324,9 +1324,9 @@ entry: define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX6-LABEL: local_agent_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX7-LABEL: local_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1347,8 +1347,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1357,8 +1357,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1367,8 +1367,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1378,8 +1378,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1388,8 +1388,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1398,8 +1398,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1408,8 +1408,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1428,8 +1428,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1438,8 +1438,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1448,8 +1448,8 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1464,9 +1464,9 @@ entry: define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX6-LABEL: local_agent_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1477,8 +1477,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX7-LABEL: local_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1489,8 +1489,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1501,8 +1501,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1512,8 +1512,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1524,8 +1524,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1535,8 +1535,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1546,8 +1546,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1557,8 +1557,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1568,8 +1568,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1580,8 +1580,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1591,8 +1591,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1603,8 +1603,8 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1620,9 +1620,9 @@ entry: define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX6-LABEL: local_agent_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1633,8 +1633,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX7-LABEL: local_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1645,8 +1645,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1657,8 +1657,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1668,8 +1668,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1680,8 +1680,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1691,8 +1691,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1702,8 +1702,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1713,8 +1713,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1724,8 +1724,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1747,8 +1747,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1761,8 +1761,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1778,9 +1778,9 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX6-LABEL: local_agent_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1792,8 +1792,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1805,8 +1805,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1819,8 +1819,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1831,8 +1831,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1844,8 +1844,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1856,8 +1856,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1868,8 +1868,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1880,8 +1880,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1892,8 +1892,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1906,8 +1906,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1918,8 +1918,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1934,8 +1934,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1952,9 +1952,9 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX6-LABEL: local_agent_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1966,8 +1966,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1979,8 +1979,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1993,8 +1993,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -2005,8 +2005,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -2018,8 +2018,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -2030,8 +2030,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -2042,8 +2042,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -2054,8 +2054,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -2066,8 +2066,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -2080,8 +2080,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -2092,8 +2092,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -2108,8 +2108,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -2126,9 +2126,9 @@ entry: define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX6-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2142,8 +2142,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2157,8 +2157,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2171,8 +2171,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2184,8 +2184,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2199,8 +2199,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2212,8 +2212,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2226,8 +2226,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2239,8 +2239,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,8 +2253,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,8 +2267,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2280,8 +2280,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2294,8 +2294,8 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2314,9 +2314,9 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2331,8 +2331,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2347,8 +2347,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2363,8 +2363,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2377,8 +2377,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2393,8 +2393,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2407,8 +2407,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2422,8 +2422,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2436,8 +2436,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2451,8 +2451,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2467,8 +2467,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2481,8 +2481,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2499,8 +2499,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2520,9 +2520,9 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2537,8 +2537,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2553,8 +2553,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2569,8 +2569,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2583,8 +2583,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2599,8 +2599,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2613,8 +2613,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,8 +2628,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2642,8 +2642,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2657,8 +2657,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2673,8 +2673,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2687,8 +2687,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2705,8 +2705,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2726,7 +2726,6 @@ entry: define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2741,7 +2740,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2755,7 +2753,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2768,7 +2765,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2781,7 +2777,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2795,7 +2790,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2808,7 +2802,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2821,7 +2814,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2834,7 +2826,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2847,7 +2838,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2860,7 +2850,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2873,8 +2862,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2887,8 +2874,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2908,7 +2893,6 @@ entry: define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2924,7 +2908,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2939,7 +2922,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2954,7 +2936,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2968,7 +2949,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2983,7 +2963,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2997,7 +2976,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3011,7 +2989,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3025,7 +3002,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3039,7 +3015,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3054,7 +3029,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3068,8 +3042,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3084,8 +3056,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3106,7 +3076,6 @@ entry: define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3122,7 +3091,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3137,7 +3105,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3152,7 +3119,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3166,7 +3132,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3181,7 +3146,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3195,7 +3159,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3209,7 +3172,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3223,7 +3185,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3237,7 +3198,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3252,7 +3212,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3266,8 +3225,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3284,8 +3241,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3306,7 +3261,6 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3323,7 +3277,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3339,7 +3292,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3356,7 +3308,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3371,7 +3322,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3387,7 +3337,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3402,7 +3351,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3417,7 +3365,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3432,7 +3379,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3447,7 +3393,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3464,7 +3409,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3479,8 +3423,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3499,8 +3441,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3522,7 +3462,6 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3539,7 +3478,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3555,7 +3493,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3572,7 +3509,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3587,7 +3523,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3603,7 +3538,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3618,7 +3552,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3633,7 +3566,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3648,7 +3580,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3663,7 +3594,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3680,7 +3610,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3695,8 +3624,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3715,8 +3642,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3738,7 +3663,6 @@ entry: define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3754,7 +3678,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3769,7 +3692,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3784,7 +3706,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3798,7 +3719,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3813,7 +3733,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3827,7 +3746,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3841,7 +3759,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3855,7 +3772,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3869,7 +3785,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3884,7 +3799,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3898,8 +3812,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,8 +3826,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3936,7 +3846,6 @@ entry: define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3952,7 +3861,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3967,7 +3875,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3982,7 +3889,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3996,7 +3902,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4011,7 +3916,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4025,7 +3929,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4039,7 +3942,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4053,7 +3955,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4067,7 +3968,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4082,7 +3982,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4096,8 +3995,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4112,8 +4009,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4134,7 +4029,6 @@ entry: define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX6-LABEL: local_agent_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4151,7 +4045,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4167,7 +4060,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4184,7 +4076,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4199,7 +4090,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4215,7 +4105,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4230,7 +4119,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4245,7 +4133,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4260,7 +4147,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4275,7 +4161,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4292,7 +4177,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4307,8 +4191,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4327,8 +4209,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4350,7 +4230,6 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4367,7 +4246,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4383,7 +4261,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4400,7 +4277,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4415,7 +4291,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4431,7 +4306,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4446,7 +4320,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4461,7 +4334,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4476,7 +4348,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4491,7 +4362,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4508,7 +4378,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4523,8 +4392,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4543,8 +4410,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4566,7 +4431,6 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4583,7 +4447,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4599,7 +4462,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4616,7 +4478,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4631,7 +4492,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4647,7 +4507,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4662,7 +4521,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4677,7 +4535,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4692,7 +4549,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4707,7 +4563,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4724,7 +4579,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4739,8 +4593,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4759,8 +4611,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4782,7 +4632,6 @@ entry: define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4799,7 +4648,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4815,7 +4663,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4832,7 +4679,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4847,7 +4693,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4863,7 +4708,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4878,7 +4722,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4893,7 +4736,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4908,7 +4750,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4923,7 +4764,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4940,7 +4780,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4955,8 +4794,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4975,8 +4812,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4998,7 +4833,6 @@ entry: define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5015,7 +4849,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5031,7 +4864,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5048,7 +4880,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5063,7 +4894,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5079,7 +4909,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5094,7 +4923,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5109,7 +4937,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5124,7 +4951,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5139,7 +4965,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5156,7 +4981,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5171,8 +4995,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5191,8 +5013,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5214,7 +5034,6 @@ entry: define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5231,7 +5050,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5247,7 +5065,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5264,7 +5081,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5279,7 +5095,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5295,7 +5110,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5310,7 +5124,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5325,7 +5138,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5340,7 +5152,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5355,7 +5166,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5372,7 +5182,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5387,8 +5196,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5407,8 +5214,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5430,7 +5235,6 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5447,7 +5251,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5463,7 +5266,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5480,7 +5282,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5495,7 +5296,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5511,7 +5311,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5526,7 +5325,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5541,7 +5339,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5556,7 +5353,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5571,7 +5367,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5588,7 +5383,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5603,8 +5397,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5623,8 +5415,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5646,7 +5436,6 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5663,7 +5452,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5679,7 +5467,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5696,7 +5483,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5711,7 +5497,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5727,7 +5512,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5742,7 +5526,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5757,7 +5540,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5772,7 +5554,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5787,7 +5568,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5804,7 +5584,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5819,8 +5598,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5839,8 +5616,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5862,10 +5637,10 @@ entry: define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5880,9 +5655,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5897,10 +5672,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5913,10 +5687,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5929,9 +5702,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5946,10 +5719,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5962,10 +5734,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5978,10 +5749,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5994,10 +5764,9 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6010,7 +5779,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6026,7 +5794,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6042,8 +5809,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6059,8 +5824,6 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6085,10 +5848,10 @@ entry: define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6103,9 +5866,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6120,10 +5883,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6137,10 +5899,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6153,9 +5914,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6170,10 +5931,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6186,10 +5946,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6203,10 +5962,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6219,10 +5977,9 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6236,7 +5993,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6253,7 +6009,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6269,8 +6024,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6287,8 +6040,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6313,10 +6064,10 @@ entry: define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6332,9 +6083,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6350,10 +6101,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6368,10 +6118,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6385,9 +6134,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6403,10 +6152,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6420,10 +6168,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6437,10 +6184,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6454,10 +6200,9 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6471,7 +6216,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6489,7 +6233,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6506,8 +6249,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6527,8 +6268,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6554,10 +6293,10 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6573,9 +6312,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6591,10 +6330,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6610,10 +6348,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6627,9 +6364,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6645,10 +6382,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6662,10 +6398,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6680,10 +6415,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6697,10 +6431,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6715,7 +6448,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6734,7 +6466,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6751,8 +6482,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6773,8 +6502,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6800,10 +6527,10 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6819,9 +6546,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6837,10 +6564,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6856,10 +6582,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6873,9 +6598,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6891,10 +6616,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6908,10 +6632,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6926,10 +6649,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6943,10 +6665,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6961,7 +6682,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,7 +6700,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6997,8 +6716,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7019,8 +6736,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7046,10 +6761,10 @@ entry: define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7064,9 +6779,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7081,10 +6796,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7098,10 +6812,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7114,9 +6827,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7131,10 +6844,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7147,10 +6859,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7164,10 +6875,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7180,10 +6890,9 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7197,7 +6906,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7214,7 +6922,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7230,8 +6937,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7248,8 +6953,6 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7274,10 +6977,10 @@ entry: define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7292,9 +6995,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7309,10 +7012,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7326,10 +7028,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7342,9 +7043,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7359,10 +7060,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7375,10 +7075,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7392,10 +7091,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7408,10 +7106,9 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7425,7 +7122,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7442,7 +7138,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7458,8 +7153,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7476,8 +7169,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7502,10 +7193,10 @@ entry: define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7521,9 +7212,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7539,10 +7230,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7558,10 +7248,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7575,9 +7264,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7593,10 +7282,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7610,10 +7298,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7628,10 +7315,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7645,10 +7331,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7663,7 +7348,6 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7682,7 +7366,6 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,8 +7382,6 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7721,8 +7402,6 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7748,10 +7427,10 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7767,9 +7446,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7785,10 +7464,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7804,10 +7482,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7821,9 +7498,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7839,10 +7516,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7856,10 +7532,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7874,10 +7549,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7891,10 +7565,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7909,7 +7582,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7928,7 +7600,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7945,8 +7616,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7967,8 +7636,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7994,10 +7661,10 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8013,9 +7680,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8031,10 +7698,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8050,10 +7716,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8067,9 +7732,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8085,10 +7750,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8102,10 +7766,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8120,10 +7783,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8137,10 +7799,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8155,7 +7816,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8174,7 +7834,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8191,8 +7850,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8213,8 +7870,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8240,10 +7895,10 @@ entry: define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8259,9 +7914,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8277,10 +7932,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8296,10 +7950,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8313,9 +7966,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8331,10 +7984,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8348,10 +8000,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8366,10 +8017,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8383,10 +8033,9 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8401,7 +8050,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8420,7 +8068,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8437,8 +8084,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8459,8 +8104,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8486,10 +8129,10 @@ entry: define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8505,9 +8148,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8523,10 +8166,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8542,10 +8184,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8559,9 +8200,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8577,10 +8218,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8594,10 +8234,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8612,10 +8251,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8629,10 +8267,9 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8647,7 +8284,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8666,7 +8302,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8683,8 +8318,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8705,8 +8338,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8732,10 +8363,10 @@ entry: define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8751,9 +8382,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8769,10 +8400,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8788,10 +8418,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8805,9 +8434,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8823,10 +8452,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8840,10 +8468,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8858,10 +8485,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8875,10 +8501,9 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8893,7 +8518,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8912,7 +8536,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8929,8 +8552,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8951,8 +8572,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8978,10 +8597,10 @@ entry: define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8997,9 +8616,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -9015,10 +8634,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -9034,10 +8652,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -9051,9 +8668,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -9069,10 +8686,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9086,10 +8702,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9104,10 +8719,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9121,10 +8735,9 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9139,7 +8752,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9158,7 +8770,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9175,8 +8786,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9197,8 +8806,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9224,10 +8831,10 @@ entry: define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -9243,9 +8850,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -9261,10 +8868,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -9280,10 +8886,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -9297,9 +8902,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -9315,10 +8920,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9332,10 +8936,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9350,10 +8953,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9367,10 +8969,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9385,7 +8986,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9404,7 +9004,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9421,8 +9020,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9443,8 +9040,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9470,8 +9065,8 @@ entry: define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX6-LABEL: local_agent_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9486,8 +9081,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX7-LABEL: local_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9500,8 +9095,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: local_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9512,8 +9107,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX10-CU-LABEL: local_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9524,8 +9119,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9538,8 +9133,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9550,8 +9145,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9562,8 +9157,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9574,8 +9169,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9586,8 +9181,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX11-WGP-LABEL: local_agent_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9598,8 +9193,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX11-CU-LABEL: local_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9610,8 +9205,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX12-WGP-LABEL: local_agent_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9622,8 +9217,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_agent_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9641,8 +9236,8 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; GFX6-LABEL: local_agent_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9657,8 +9252,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX7-LABEL: local_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9671,8 +9266,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9683,8 +9278,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9695,8 +9290,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9709,8 +9304,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9721,8 +9316,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9733,8 +9328,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9745,8 +9340,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9757,8 +9352,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9769,8 +9364,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9781,8 +9376,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9793,8 +9388,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9812,8 +9407,8 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX6-LABEL: local_agent_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9828,8 +9423,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX7-LABEL: local_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9842,8 +9437,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9854,8 +9449,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9866,8 +9461,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9880,8 +9475,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9892,8 +9487,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9904,8 +9499,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9916,8 +9511,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9928,8 +9523,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9940,8 +9535,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9952,8 +9547,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9964,8 +9559,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9983,8 +9578,8 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX6-LABEL: local_agent_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9999,8 +9594,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10013,8 +9608,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -10025,8 +9620,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -10037,8 +9632,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10051,8 +9646,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10063,8 +9658,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10075,8 +9670,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10087,8 +9682,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10099,8 +9694,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -10111,8 +9706,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -10123,8 +9718,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -10135,8 +9730,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -10154,9 +9749,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX6-LABEL: local_agent_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10166,8 +9761,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX7-LABEL: local_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10177,8 +9772,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: local_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10187,8 +9782,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX10-CU-LABEL: local_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10197,8 +9792,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10208,8 +9803,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10218,8 +9813,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10228,8 +9823,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10238,8 +9833,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10248,8 +9843,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX11-WGP-LABEL: local_agent_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10258,8 +9853,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX11-CU-LABEL: local_agent_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10268,8 +9863,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX12-WGP-LABEL: local_agent_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10278,8 +9873,8 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX12-CU-LABEL: local_agent_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10294,9 +9889,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX6-LABEL: local_agent_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10306,8 +9901,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX7-LABEL: local_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10317,8 +9912,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10327,8 +9922,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10337,8 +9932,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10348,8 +9943,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10358,8 +9953,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10368,8 +9963,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10378,8 +9973,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10388,8 +9983,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10398,8 +9993,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10408,8 +10003,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10418,8 +10013,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10434,9 +10029,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-LABEL: local_agent_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10446,8 +10041,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX7-LABEL: local_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10457,8 +10052,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10467,8 +10062,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX10-CU-LABEL: local_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10477,8 +10072,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10488,8 +10083,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10498,8 +10093,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10508,8 +10103,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10518,8 +10113,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10528,8 +10123,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10538,8 +10133,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX11-CU-LABEL: local_agent_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10548,8 +10143,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10558,8 +10153,8 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX12-CU-LABEL: local_agent_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10574,9 +10169,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-LABEL: local_agent_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10586,8 +10181,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10597,8 +10192,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10607,8 +10202,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10617,8 +10212,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10628,8 +10223,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10638,8 +10233,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10648,8 +10243,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10658,8 +10253,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10668,8 +10263,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10678,8 +10273,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10688,8 +10283,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10698,8 +10293,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10714,9 +10309,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10726,8 +10321,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10737,8 +10332,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10747,8 +10342,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10757,8 +10352,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10768,8 +10363,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10778,8 +10373,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10788,8 +10383,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10798,8 +10393,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10808,8 +10403,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10818,8 +10413,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10828,8 +10423,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10838,8 +10433,8 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10854,9 +10449,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10866,8 +10461,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10877,8 +10472,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10887,8 +10482,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10897,8 +10492,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10908,8 +10503,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10918,8 +10513,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10928,8 +10523,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10938,8 +10533,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10948,8 +10543,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10958,8 +10553,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10968,8 +10563,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10978,8 +10573,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10994,9 +10589,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX6-LABEL: local_agent_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11006,8 +10601,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11017,8 +10612,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11027,8 +10622,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11037,8 +10632,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11048,8 +10643,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11058,8 +10653,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11068,8 +10663,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11078,8 +10673,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11088,8 +10683,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11098,8 +10693,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11108,8 +10703,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11118,8 +10713,8 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11134,9 +10729,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11146,8 +10741,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11157,8 +10752,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11167,8 +10762,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11177,8 +10772,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11188,8 +10783,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11198,8 +10793,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11208,8 +10803,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11218,8 +10813,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11228,8 +10823,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11238,8 +10833,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11248,8 +10843,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11258,8 +10853,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11274,9 +10869,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11286,8 +10881,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11297,8 +10892,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11307,8 +10902,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11317,8 +10912,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11328,8 +10923,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11338,8 +10933,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11348,8 +10943,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11358,8 +10953,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11368,8 +10963,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11378,8 +10973,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11388,8 +10983,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11398,8 +10993,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11414,9 +11009,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11430,8 +11025,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11445,8 +11040,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11458,8 +11053,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11471,8 +11066,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11486,8 +11081,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11499,8 +11094,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11512,8 +11107,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11525,8 +11120,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11538,8 +11133,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11551,8 +11146,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11564,8 +11159,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11577,8 +11172,8 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11597,9 +11192,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11613,8 +11208,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11628,8 +11223,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11641,8 +11236,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11654,8 +11249,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11669,8 +11264,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11682,8 +11277,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11695,8 +11290,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11708,8 +11303,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11721,8 +11316,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11734,8 +11329,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11747,8 +11342,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11760,8 +11355,8 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11780,9 +11375,9 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11796,8 +11391,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11811,8 +11406,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11824,8 +11419,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11837,8 +11432,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11852,8 +11447,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11865,8 +11460,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11878,8 +11473,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11891,8 +11486,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11904,8 +11499,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11917,8 +11512,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11930,8 +11525,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11943,8 +11538,8 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11963,7 +11558,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11978,7 +11572,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11992,7 +11585,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12005,7 +11597,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12018,7 +11609,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12032,7 +11622,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12045,7 +11634,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12058,7 +11646,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12071,7 +11658,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12084,7 +11670,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12097,7 +11682,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12110,8 +11694,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12124,8 +11706,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12145,7 +11725,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12160,7 +11739,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12174,7 +11752,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12187,7 +11764,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12200,7 +11776,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12214,7 +11789,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12227,7 +11801,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12240,7 +11813,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12253,7 +11825,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12266,7 +11837,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12279,7 +11849,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12292,8 +11861,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12306,8 +11873,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12327,7 +11892,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12342,7 +11906,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12356,7 +11919,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12369,7 +11931,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12382,7 +11943,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12396,7 +11956,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12409,7 +11968,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12422,7 +11980,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12435,7 +11992,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12448,7 +12004,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12461,7 +12016,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12474,8 +12028,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12488,8 +12040,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12509,7 +12059,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12524,7 +12073,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12538,7 +12086,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12551,7 +12098,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12564,7 +12110,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12578,7 +12123,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12591,7 +12135,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12604,7 +12147,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12617,7 +12159,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12630,7 +12171,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12643,7 +12183,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12656,8 +12195,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12670,8 +12207,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12691,7 +12226,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12706,7 +12240,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12720,7 +12253,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12733,7 +12265,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12746,7 +12277,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12760,7 +12290,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12773,7 +12302,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12786,7 +12314,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12799,7 +12326,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12812,7 +12338,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12825,7 +12350,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12838,8 +12362,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12852,8 +12374,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12873,7 +12393,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12888,7 +12407,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12902,7 +12420,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12915,7 +12432,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12928,7 +12444,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12942,7 +12457,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12955,7 +12469,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12968,7 +12481,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12981,7 +12493,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12994,7 +12505,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13007,7 +12517,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13020,8 +12529,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13034,8 +12541,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13055,7 +12560,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13070,7 +12574,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13084,7 +12587,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13097,7 +12599,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13110,7 +12611,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13124,7 +12624,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13137,7 +12636,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13150,7 +12648,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13163,7 +12660,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13176,7 +12672,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13189,7 +12684,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13202,8 +12696,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13216,8 +12708,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13237,7 +12727,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13252,7 +12741,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13266,7 +12754,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13279,7 +12766,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13292,7 +12778,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13306,7 +12791,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13319,7 +12803,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13332,7 +12815,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13345,7 +12827,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13358,7 +12839,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13371,7 +12851,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13384,8 +12863,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13398,8 +12875,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13419,7 +12894,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13434,7 +12908,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13448,7 +12921,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13461,7 +12933,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13474,7 +12945,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13488,7 +12958,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13501,7 +12970,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13514,7 +12982,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13527,7 +12994,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13540,7 +13006,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13553,7 +13018,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13566,8 +13030,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13580,8 +13042,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13601,7 +13061,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13616,7 +13075,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13630,7 +13088,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13643,7 +13100,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13656,7 +13112,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13670,7 +13125,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13683,7 +13137,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13696,7 +13149,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13709,7 +13161,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13722,7 +13173,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13735,7 +13185,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13748,8 +13197,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13762,8 +13209,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13783,7 +13228,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13798,7 +13242,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13812,7 +13255,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13825,7 +13267,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13838,7 +13279,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13852,7 +13292,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13865,7 +13304,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13878,7 +13316,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13891,7 +13328,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13904,7 +13340,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13917,7 +13352,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13930,8 +13364,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13944,8 +13376,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13965,7 +13395,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13980,7 +13409,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13994,7 +13422,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14007,7 +13434,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14020,7 +13446,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14034,7 +13459,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14047,7 +13471,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14060,7 +13483,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14073,7 +13495,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14086,7 +13507,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14099,7 +13519,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14112,8 +13531,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14126,8 +13543,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14147,7 +13562,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14162,7 +13576,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14176,7 +13589,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14189,7 +13601,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14202,7 +13613,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14216,7 +13626,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14229,7 +13638,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14242,7 +13650,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14255,7 +13662,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14268,7 +13674,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14281,7 +13686,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14294,8 +13698,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14308,8 +13710,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14329,7 +13729,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14344,7 +13743,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14358,7 +13756,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14371,7 +13768,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14384,7 +13780,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14398,7 +13793,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14411,7 +13805,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14424,7 +13817,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14437,7 +13829,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14450,7 +13841,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14463,7 +13853,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14476,8 +13865,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14490,8 +13877,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14511,7 +13896,6 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14526,7 +13910,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14540,7 +13923,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14553,7 +13935,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14566,7 +13947,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14580,7 +13960,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14593,7 +13972,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14606,7 +13984,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14619,7 +13996,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14632,7 +14008,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14645,7 +14020,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14658,8 +14032,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14672,8 +14044,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14693,10 +14063,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14711,9 +14081,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14728,10 +14098,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14744,10 +14113,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14760,9 +14128,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14777,10 +14145,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14793,10 +14160,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14809,10 +14175,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14825,10 +14190,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14841,7 +14205,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14857,7 +14220,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14873,8 +14235,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14890,8 +14250,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14916,10 +14274,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14934,9 +14292,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14951,10 +14309,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14967,10 +14324,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14983,9 +14339,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15000,10 +14356,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15016,10 +14371,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15032,10 +14386,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15048,10 +14401,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15064,7 +14416,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15080,7 +14431,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15096,8 +14446,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15113,8 +14461,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15139,10 +14485,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15157,9 +14503,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15174,10 +14520,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15190,10 +14535,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15206,9 +14550,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15223,10 +14567,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15239,10 +14582,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15255,10 +14597,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15271,10 +14612,9 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15287,7 +14627,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15303,7 +14642,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15319,8 +14657,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15336,8 +14672,6 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15362,10 +14696,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15380,9 +14714,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15397,10 +14731,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15413,10 +14746,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15429,9 +14761,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15446,10 +14778,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15462,10 +14793,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15478,10 +14808,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15494,10 +14823,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15510,7 +14838,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15526,7 +14853,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15542,8 +14868,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15559,8 +14883,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15585,10 +14907,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15603,9 +14925,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15620,10 +14942,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15636,10 +14957,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15652,9 +14972,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15669,10 +14989,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15685,10 +15004,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15701,10 +15019,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15717,10 +15034,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15733,7 +15049,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15749,7 +15064,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15765,8 +15079,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15782,8 +15094,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15808,10 +15118,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15826,9 +15136,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15843,10 +15153,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15859,10 +15168,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15875,9 +15183,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15892,10 +15200,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15908,10 +15215,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15924,10 +15230,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15940,10 +15245,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15956,7 +15260,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15972,7 +15275,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15988,8 +15290,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16005,8 +15305,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16031,10 +15329,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16049,9 +15347,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16066,10 +15364,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16082,10 +15379,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16098,9 +15394,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16115,10 +15411,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16131,10 +15426,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16147,10 +15441,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16163,10 +15456,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16179,7 +15471,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16195,7 +15486,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16211,8 +15501,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16228,8 +15516,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16254,10 +15540,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16272,9 +15558,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16289,10 +15575,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16305,10 +15590,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16321,9 +15605,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16338,10 +15622,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16354,10 +15637,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16370,10 +15652,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16386,10 +15667,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16402,7 +15682,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16418,7 +15697,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16434,8 +15712,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16451,8 +15727,6 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16477,10 +15751,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16495,9 +15769,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16512,10 +15786,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16528,10 +15801,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16544,9 +15816,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16561,10 +15833,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16577,10 +15848,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16593,10 +15863,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16609,10 +15878,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16625,7 +15893,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16641,7 +15908,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16657,8 +15923,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16674,8 +15938,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16700,10 +15962,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16718,9 +15980,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16735,10 +15997,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16751,10 +16012,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16767,9 +16027,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16784,10 +16044,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16800,10 +16059,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16816,10 +16074,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16832,10 +16089,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16848,7 +16104,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16864,7 +16119,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16880,8 +16134,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16897,8 +16149,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16923,10 +16173,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16941,9 +16191,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16958,10 +16208,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16974,10 +16223,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16990,9 +16238,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17007,10 +16255,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17023,10 +16270,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17039,10 +16285,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17055,10 +16300,9 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17071,7 +16315,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17087,7 +16330,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17103,8 +16345,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17120,8 +16360,6 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17146,10 +16384,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17164,9 +16402,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17181,10 +16419,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17197,10 +16434,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17213,9 +16449,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17230,10 +16466,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17246,10 +16481,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17262,10 +16496,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17278,10 +16511,9 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17294,7 +16526,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17310,7 +16541,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17326,8 +16556,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17343,8 +16571,6 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17369,10 +16595,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17387,9 +16613,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17404,10 +16630,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17420,10 +16645,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17436,9 +16660,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17453,10 +16677,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17469,10 +16692,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17485,10 +16707,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17501,10 +16722,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17517,7 +16737,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17533,7 +16752,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17549,8 +16767,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17566,8 +16782,6 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17592,10 +16806,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17610,9 +16824,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17627,10 +16841,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17643,10 +16856,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17659,9 +16871,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17676,10 +16888,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17692,10 +16903,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17708,10 +16918,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17724,10 +16933,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17740,7 +16948,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17756,7 +16963,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17772,8 +16978,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17789,8 +16993,6 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17815,10 +17017,10 @@ entry: define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17833,9 +17035,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17850,10 +17052,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17866,10 +17067,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17882,9 +17082,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17899,10 +17099,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17915,10 +17114,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17931,10 +17129,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17947,10 +17144,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17963,7 +17159,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17979,7 +17174,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17995,8 +17189,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18012,8 +17204,6 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index fce60ff12aed3d..3c485af18166fd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -16,9 +16,10 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr8 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -37,9 +38,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX7-LABEL: local_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -52,10 +52,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX10-WGP-LABEL: local_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -66,10 +64,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX10-CU-LABEL: local_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -80,8 +76,9 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -100,10 +97,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -114,10 +109,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -128,10 +121,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -142,10 +133,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -156,9 +145,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX11-WGP-LABEL: local_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -169,9 +157,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX11-CU-LABEL: local_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -182,10 +169,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX12-WGP-LABEL: local_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -196,10 +181,8 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX12-CU-LABEL: local_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -217,9 +200,10 @@ entry: define amdgpu_kernel void @local_nontemporal_load_1( ; GFX6-LABEL: local_nontemporal_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr8 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -240,9 +224,8 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX7-LABEL: local_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -257,11 +240,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX10-WGP-LABEL: local_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -273,11 +254,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX10-CU-LABEL: local_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -289,8 +268,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -311,11 +291,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 @@ -330,11 +308,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 @@ -349,11 +325,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 @@ -368,11 +342,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 @@ -387,10 +359,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX11-WGP-LABEL: local_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 @@ -404,10 +375,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX11-CU-LABEL: local_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 @@ -421,11 +391,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -441,11 +409,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -470,9 +436,9 @@ entry: define amdgpu_kernel void @local_nontemporal_store_0( ; GFX6-LABEL: local_nontemporal_store_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -484,9 +450,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX7-LABEL: local_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -498,10 +463,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX10-WGP-LABEL: local_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -512,10 +475,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX10-CU-LABEL: local_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s5, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -526,9 +487,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -540,10 +500,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -554,10 +512,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 @@ -568,10 +524,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -582,10 +536,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 @@ -596,9 +548,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX11-WGP-LABEL: local_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -609,9 +560,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX11-CU-LABEL: local_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -622,10 +572,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX12-WGP-LABEL: local_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -636,10 +584,8 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX12-CU-LABEL: local_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -657,9 +603,9 @@ entry: define amdgpu_kernel void @local_nontemporal_store_1( ; GFX6-LABEL: local_nontemporal_store_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_mov_b32 s6, 2 @@ -673,9 +619,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX7-LABEL: local_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -689,9 +634,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-WGP-LABEL: local_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_mov_b32 s5, 2 @@ -703,9 +647,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-CU-LABEL: local_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-CU-NEXT: s_mov_b32 s5, 2 @@ -717,9 +660,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 @@ -733,9 +675,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff @@ -750,9 +691,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff @@ -767,9 +707,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff @@ -784,9 +723,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff @@ -801,8 +739,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-WGP-LABEL: local_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff @@ -816,8 +754,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-CU-LABEL: local_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff @@ -831,8 +769,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-WGP-LABEL: local_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff @@ -848,8 +786,8 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-CU-LABEL: local_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff @@ -874,9 +812,10 @@ entry: define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX6-LABEL: local_nontemporal_volatile_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr8 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -895,9 +834,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX7-LABEL: local_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -910,10 +848,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: local_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -924,10 +860,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: local_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -938,8 +872,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -958,10 +893,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -972,10 +905,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -986,10 +917,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1000,10 +929,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX940-TGSPLIT-LABEL: local_nontemporal_volatile_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -1014,9 +941,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX11-WGP-LABEL: local_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1027,9 +953,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX11-CU-LABEL: local_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -1040,10 +965,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX12-WGP-LABEL: local_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -1054,10 +977,8 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX12-CU-LABEL: local_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index 033c71574643cf..31f36a42a2eda2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -32,8 +32,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX7-LABEL: local_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -46,8 +46,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: local_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -58,8 +58,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX10-CU-LABEL: local_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -84,8 +84,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -96,8 +96,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -108,8 +108,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -120,8 +120,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX11-WGP-LABEL: local_singlethread_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX11-CU-LABEL: local_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX12-WGP-LABEL: local_singlethread_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -168,8 +168,8 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX12-CU-LABEL: local_singlethread_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX6-LABEL: local_singlethread_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -203,8 +203,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX7-LABEL: local_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -217,8 +217,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -229,8 +229,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -241,8 +241,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -255,8 +255,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -267,8 +267,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -279,8 +279,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -291,8 +291,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -303,8 +303,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -315,8 +315,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -327,8 +327,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -339,8 +339,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -358,8 +358,8 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX6-LABEL: local_singlethread_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -374,8 +374,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX7-LABEL: local_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -388,8 +388,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -400,8 +400,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX10-CU-LABEL: local_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -412,8 +412,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -426,8 +426,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -438,8 +438,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -450,8 +450,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -462,8 +462,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -474,8 +474,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -486,8 +486,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX11-CU-LABEL: local_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -498,8 +498,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -510,8 +510,8 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX12-CU-LABEL: local_singlethread_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -529,8 +529,8 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX6-LABEL: local_singlethread_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -545,8 +545,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX7-LABEL: local_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -559,8 +559,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -571,8 +571,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -583,8 +583,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -597,8 +597,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -609,8 +609,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -621,8 +621,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -633,8 +633,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -645,8 +645,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -657,8 +657,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -669,8 +669,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -681,8 +681,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -700,9 +700,9 @@ entry: define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX6-LABEL: local_singlethread_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -712,8 +712,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX7-LABEL: local_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -723,8 +723,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: local_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -733,8 +733,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX10-CU-LABEL: local_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -743,8 +743,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -754,8 +754,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -764,8 +764,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -774,8 +774,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -784,8 +784,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -794,8 +794,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX11-WGP-LABEL: local_singlethread_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -804,8 +804,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX11-CU-LABEL: local_singlethread_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -814,8 +814,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX12-WGP-LABEL: local_singlethread_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -824,8 +824,8 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX12-CU-LABEL: local_singlethread_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -840,9 +840,9 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX6-LABEL: local_singlethread_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -852,8 +852,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX7-LABEL: local_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -863,8 +863,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -873,8 +873,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -883,8 +883,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -894,8 +894,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -904,8 +904,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -914,8 +914,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -924,8 +924,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -934,8 +934,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -944,8 +944,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -954,8 +954,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -964,8 +964,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -980,9 +980,9 @@ entry: define amdgpu_kernel void @local_singlethread_release_store( ; GFX6-LABEL: local_singlethread_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -992,8 +992,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX7-LABEL: local_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1003,8 +1003,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX10-WGP-LABEL: local_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1013,8 +1013,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX10-CU-LABEL: local_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1023,8 +1023,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1034,8 +1034,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1044,8 +1044,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1054,8 +1054,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1064,8 +1064,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1074,8 +1074,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX11-WGP-LABEL: local_singlethread_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1084,8 +1084,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX11-CU-LABEL: local_singlethread_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1094,8 +1094,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX12-WGP-LABEL: local_singlethread_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1104,8 +1104,8 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX12-CU-LABEL: local_singlethread_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1120,9 +1120,9 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX6-LABEL: local_singlethread_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1132,8 +1132,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX7-LABEL: local_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1143,8 +1143,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1153,8 +1153,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1163,8 +1163,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1174,8 +1174,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1184,8 +1184,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1194,8 +1194,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1204,8 +1204,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1214,8 +1214,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1224,8 +1224,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1244,8 +1244,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1260,9 +1260,9 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX6-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1272,8 +1272,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX7-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1283,8 +1283,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1293,8 +1293,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1303,8 +1303,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1314,8 +1314,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1324,8 +1324,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1334,8 +1334,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1344,8 +1344,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1354,8 +1354,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1364,8 +1364,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1374,8 +1374,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1384,8 +1384,8 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1400,9 +1400,9 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX6-LABEL: local_singlethread_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1412,8 +1412,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX7-LABEL: local_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1423,8 +1423,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1433,8 +1433,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1443,8 +1443,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1454,8 +1454,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1464,8 +1464,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1474,8 +1474,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1484,8 +1484,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1494,8 +1494,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1504,8 +1504,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1514,8 +1514,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1524,8 +1524,8 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1540,9 +1540,9 @@ entry: define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX6-LABEL: local_singlethread_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1552,8 +1552,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX7-LABEL: local_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1563,8 +1563,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1573,8 +1573,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1583,8 +1583,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1594,8 +1594,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1604,8 +1604,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1614,8 +1614,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1624,8 +1624,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1634,8 +1634,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1644,8 +1644,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1654,8 +1654,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1664,8 +1664,8 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1680,9 +1680,9 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX6-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1692,8 +1692,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1703,8 +1703,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1713,8 +1713,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1723,8 +1723,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1734,8 +1734,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1744,8 +1744,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1754,8 +1754,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1764,8 +1764,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1774,8 +1774,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1784,8 +1784,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1794,8 +1794,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1804,8 +1804,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1820,9 +1820,9 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX6-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1832,8 +1832,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1843,8 +1843,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1853,8 +1853,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1863,8 +1863,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1874,8 +1874,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1884,8 +1884,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1894,8 +1894,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1904,8 +1904,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1914,8 +1914,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1924,8 +1924,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1934,8 +1934,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1944,8 +1944,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1960,9 +1960,9 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -1976,8 +1976,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -1991,8 +1991,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2004,8 +2004,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2017,8 +2017,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2032,8 +2032,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2045,8 +2045,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2058,8 +2058,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2071,8 +2071,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2084,8 +2084,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2097,8 +2097,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2110,8 +2110,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2123,8 +2123,8 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2143,9 +2143,9 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2159,8 +2159,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2174,8 +2174,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2187,8 +2187,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2200,8 +2200,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2215,8 +2215,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2228,8 +2228,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2241,8 +2241,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2254,8 +2254,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,8 +2267,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2280,8 +2280,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2293,8 +2293,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2306,8 +2306,8 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2326,9 +2326,9 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2342,8 +2342,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2357,8 +2357,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2370,8 +2370,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2383,8 +2383,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2398,8 +2398,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2411,8 +2411,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2424,8 +2424,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2437,8 +2437,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2450,8 +2450,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2463,8 +2463,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2476,8 +2476,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2489,8 +2489,8 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2509,7 +2509,6 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2524,7 +2523,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2538,7 +2536,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2551,7 +2548,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2564,7 +2560,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2578,7 +2573,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2591,7 +2585,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2604,7 +2597,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2617,7 +2609,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2630,7 +2621,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2643,7 +2633,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2656,8 +2645,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2670,8 +2657,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2691,7 +2676,6 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2706,7 +2690,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2720,7 +2703,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2733,7 +2715,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2746,7 +2727,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2760,7 +2740,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2773,7 +2752,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2786,7 +2764,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2799,7 +2776,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2812,7 +2788,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2825,7 +2800,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2838,8 +2812,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2852,8 +2824,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2873,7 +2843,6 @@ entry: define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2888,7 +2857,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2902,7 +2870,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2915,7 +2882,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2928,7 +2894,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2942,7 +2907,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2955,7 +2919,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2968,7 +2931,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2981,7 +2943,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2994,7 +2955,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3007,7 +2967,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3020,8 +2979,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3034,8 +2991,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3055,7 +3010,6 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3070,7 +3024,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3084,7 +3037,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3097,7 +3049,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3110,7 +3061,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3124,7 +3074,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3137,7 +3086,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3150,7 +3098,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3163,7 +3110,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3176,7 +3122,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3189,7 +3134,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3202,8 +3146,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3216,8 +3158,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3237,7 +3177,6 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3252,7 +3191,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3266,7 +3204,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3279,7 +3216,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3292,7 +3228,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3306,7 +3241,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3319,7 +3253,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3332,7 +3265,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3345,7 +3277,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3358,7 +3289,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3371,7 +3301,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3384,8 +3313,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3398,8 +3325,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3419,7 +3344,6 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3434,7 +3358,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3448,7 +3371,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3461,7 +3383,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3474,7 +3395,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3488,7 +3408,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3501,7 +3420,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3514,7 +3432,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3527,7 +3444,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3540,7 +3456,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3553,7 +3468,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3566,8 +3480,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3580,8 +3492,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3601,7 +3511,6 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3616,7 +3525,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3630,7 +3538,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3643,7 +3550,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3656,7 +3562,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3670,7 +3575,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3683,7 +3587,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3696,7 +3599,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3709,7 +3611,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3722,7 +3623,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3735,7 +3635,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3748,8 +3647,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3762,8 +3659,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3783,7 +3678,6 @@ entry: define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3798,7 +3692,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3812,7 +3705,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3825,7 +3717,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3838,7 +3729,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3852,7 +3742,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3865,7 +3754,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3878,7 +3766,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3891,7 +3778,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3904,7 +3790,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3917,7 +3802,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3930,8 +3814,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3944,8 +3826,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3965,7 +3845,6 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3980,7 +3859,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3994,7 +3872,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4007,7 +3884,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4020,7 +3896,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4034,7 +3909,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4047,7 +3921,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4060,7 +3933,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4073,7 +3945,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4086,7 +3957,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4099,7 +3969,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4112,8 +3981,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4126,8 +3993,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4147,7 +4012,6 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4162,7 +4026,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4176,7 +4039,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4189,7 +4051,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4202,7 +4063,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4216,7 +4076,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4229,7 +4088,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4242,7 +4100,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4255,7 +4112,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4268,7 +4124,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4281,7 +4136,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4294,8 +4148,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4308,8 +4160,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4329,7 +4179,6 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4344,7 +4193,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4358,7 +4206,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4371,7 +4218,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4384,7 +4230,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4398,7 +4243,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4411,7 +4255,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4424,7 +4267,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4437,7 +4279,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4450,7 +4291,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4463,7 +4303,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4476,8 +4315,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4490,8 +4327,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4511,7 +4346,6 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4526,7 +4360,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4540,7 +4373,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4553,7 +4385,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4566,7 +4397,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4580,7 +4410,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4593,7 +4422,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4606,7 +4434,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4619,7 +4446,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4632,7 +4458,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4645,7 +4470,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4658,8 +4482,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4672,8 +4494,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4693,7 +4513,6 @@ entry: define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4708,7 +4527,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4722,7 +4540,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4735,7 +4552,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4748,7 +4564,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4762,7 +4577,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4775,7 +4589,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4788,7 +4601,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4801,7 +4613,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4814,7 +4625,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4827,7 +4637,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4840,8 +4649,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4854,8 +4661,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4875,7 +4680,6 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4890,7 +4694,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4904,7 +4707,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4917,7 +4719,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4930,7 +4731,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4944,7 +4744,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4957,7 +4756,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4970,7 +4768,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4983,7 +4780,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4996,7 +4792,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5009,7 +4804,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5022,8 +4816,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5036,8 +4828,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5057,7 +4847,6 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5072,7 +4861,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5086,7 +4874,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5099,7 +4886,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5112,7 +4898,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5126,7 +4911,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5139,7 +4923,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5152,7 +4935,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5165,7 +4947,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5178,7 +4959,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5191,7 +4971,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5204,8 +4983,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5218,8 +4995,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5239,10 +5014,10 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5257,9 +5032,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5274,10 +5049,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5290,10 +5064,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5306,9 +5079,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5323,10 +5096,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5339,10 +5111,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5355,10 +5126,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5371,10 +5141,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5387,7 +5156,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5403,7 +5171,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5419,8 +5186,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5436,8 +5201,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5462,10 +5225,10 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5480,9 +5243,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5497,10 +5260,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5513,10 +5275,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5529,9 +5290,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5546,10 +5307,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5562,10 +5322,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5578,10 +5337,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5594,10 +5352,9 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5610,7 +5367,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5626,7 +5382,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5642,8 +5397,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5659,8 +5412,6 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5685,10 +5436,10 @@ entry: define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5703,9 +5454,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5720,10 +5471,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5736,10 +5486,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5752,9 +5501,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5769,10 +5518,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5785,10 +5533,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5801,10 +5548,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5817,10 +5563,9 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5833,7 +5578,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5849,7 +5593,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5865,8 +5608,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5882,8 +5623,6 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5908,10 +5647,10 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5926,9 +5665,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5943,10 +5682,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5959,10 +5697,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5975,9 +5712,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5992,10 +5729,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6008,10 +5744,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6024,10 +5759,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6040,10 +5774,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6056,7 +5789,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6072,7 +5804,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6088,8 +5819,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6105,8 +5834,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6131,10 +5858,10 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6149,9 +5876,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6166,10 +5893,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6182,10 +5908,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6198,9 +5923,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6215,10 +5940,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6231,10 +5955,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6247,10 +5970,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6263,10 +5985,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6279,7 +6000,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6295,7 +6015,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6311,8 +6030,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6328,8 +6045,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6354,10 +6069,10 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6372,9 +6087,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6389,10 +6104,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6405,10 +6119,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6421,9 +6134,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6438,10 +6151,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6454,10 +6166,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6470,10 +6181,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6486,10 +6196,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6502,7 +6211,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6518,7 +6226,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6534,8 +6241,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6551,8 +6256,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6577,10 +6280,10 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6595,9 +6298,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6612,10 +6315,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6628,10 +6330,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6644,9 +6345,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6661,10 +6362,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6677,10 +6377,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6693,10 +6392,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6709,10 +6407,9 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6725,7 +6422,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6741,7 +6437,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,8 +6452,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6774,8 +6467,6 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6800,10 +6491,10 @@ entry: define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6818,9 +6509,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6835,10 +6526,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6851,10 +6541,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6867,9 +6556,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6884,10 +6573,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6900,10 +6588,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6916,10 +6603,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6932,10 +6618,9 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6948,7 +6633,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6964,7 +6648,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,8 +6663,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6997,8 +6678,6 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7023,10 +6702,10 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7041,9 +6720,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7058,10 +6737,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7074,10 +6752,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7090,9 +6767,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7107,10 +6784,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7123,10 +6799,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7139,10 +6814,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7155,10 +6829,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7171,7 +6844,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7187,7 +6859,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7203,8 +6874,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7220,8 +6889,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7246,10 +6913,10 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7264,9 +6931,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7281,10 +6948,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7297,10 +6963,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7313,9 +6978,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7330,10 +6995,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7346,10 +7010,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7362,10 +7025,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7378,10 +7040,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7394,7 +7055,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7410,7 +7070,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7426,8 +7085,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7443,8 +7100,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7469,10 +7124,10 @@ entry: define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7487,9 +7142,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7504,10 +7159,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7520,10 +7174,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7536,9 +7189,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7553,10 +7206,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7569,10 +7221,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7585,10 +7236,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7601,10 +7251,9 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7617,7 +7266,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7633,7 +7281,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7649,8 +7296,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7666,8 +7311,6 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7692,10 +7335,10 @@ entry: define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7710,9 +7353,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7727,10 +7370,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7743,10 +7385,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7759,9 +7400,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7776,10 +7417,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7792,10 +7432,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7808,10 +7447,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7824,10 +7462,9 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7840,7 +7477,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7856,7 +7492,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7872,8 +7507,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7889,8 +7522,6 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7915,10 +7546,10 @@ entry: define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7933,9 +7564,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7950,10 +7581,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7966,10 +7596,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7982,9 +7611,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7999,10 +7628,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8015,10 +7643,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8031,10 +7658,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8047,10 +7673,9 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8063,7 +7688,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8079,7 +7703,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8095,8 +7718,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8112,8 +7733,6 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8138,10 +7757,10 @@ entry: define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8156,9 +7775,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8173,10 +7792,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8189,10 +7807,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8205,9 +7822,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8222,10 +7839,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8238,10 +7854,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8254,10 +7869,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8270,10 +7884,9 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8286,7 +7899,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8302,7 +7914,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8318,8 +7929,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8335,8 +7944,6 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8361,10 +7968,10 @@ entry: define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8379,9 +7986,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8396,10 +8003,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8412,10 +8018,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8428,9 +8033,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8445,10 +8050,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8461,10 +8065,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8477,10 +8080,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8493,10 +8095,9 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8509,7 +8110,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8525,7 +8125,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8541,8 +8140,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8558,8 +8155,6 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8584,8 +8179,8 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX6-LABEL: local_singlethread_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -8600,8 +8195,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX7-LABEL: local_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -8614,8 +8209,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -8626,8 +8221,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX10-CU-LABEL: local_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -8638,8 +8233,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -8652,8 +8247,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8664,8 +8259,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8676,8 +8271,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8688,8 +8283,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8700,8 +8295,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -8712,8 +8307,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX11-CU-LABEL: local_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -8724,8 +8319,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -8736,8 +8331,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -8755,8 +8350,8 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX6-LABEL: local_singlethread_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -8771,8 +8366,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -8785,8 +8380,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -8797,8 +8392,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -8809,8 +8404,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -8823,8 +8418,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8835,8 +8430,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8847,8 +8442,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8859,8 +8454,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8871,8 +8466,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -8883,8 +8478,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -8895,8 +8490,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -8907,8 +8502,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -8926,8 +8521,8 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX6-LABEL: local_singlethread_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -8942,8 +8537,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -8956,8 +8551,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -8968,8 +8563,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -8980,8 +8575,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -8994,8 +8589,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9006,8 +8601,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9018,8 +8613,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9030,8 +8625,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9042,8 +8637,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9054,8 +8649,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9066,8 +8661,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9078,8 +8673,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9097,8 +8692,8 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9113,8 +8708,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9127,8 +8722,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9139,8 +8734,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9151,8 +8746,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9165,8 +8760,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9177,8 +8772,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9189,8 +8784,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9201,8 +8796,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9213,8 +8808,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9225,8 +8820,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9237,8 +8832,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9249,8 +8844,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9268,9 +8863,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX6-LABEL: local_singlethread_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9280,8 +8875,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX7-LABEL: local_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9291,8 +8886,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9301,8 +8896,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX10-CU-LABEL: local_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9311,8 +8906,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9322,8 +8917,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9332,8 +8927,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9342,8 +8937,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9352,8 +8947,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9362,8 +8957,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9372,8 +8967,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX11-CU-LABEL: local_singlethread_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9382,8 +8977,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9392,8 +8987,8 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX12-CU-LABEL: local_singlethread_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9408,9 +9003,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX6-LABEL: local_singlethread_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9420,8 +9015,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9431,8 +9026,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9441,8 +9036,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9451,8 +9046,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9462,8 +9057,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9472,8 +9067,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9482,8 +9077,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9492,8 +9087,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9502,8 +9097,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9512,8 +9107,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9522,8 +9117,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9532,8 +9127,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9548,9 +9143,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX6-LABEL: local_singlethread_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9560,8 +9155,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX7-LABEL: local_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9571,8 +9166,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9581,8 +9176,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9591,8 +9186,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9602,8 +9197,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9612,8 +9207,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9622,8 +9217,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9632,8 +9227,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9642,8 +9237,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9652,8 +9247,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9662,8 +9257,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9672,8 +9267,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9688,9 +9283,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9700,8 +9295,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9711,8 +9306,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9721,8 +9316,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9731,8 +9326,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9742,8 +9337,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9752,8 +9347,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9762,8 +9357,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9772,8 +9367,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9782,8 +9377,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9792,8 +9387,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9802,8 +9397,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9812,8 +9407,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9828,9 +9423,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9840,8 +9435,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9851,8 +9446,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9861,8 +9456,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9871,8 +9466,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9882,8 +9477,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9892,8 +9487,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9902,8 +9497,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9912,8 +9507,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9922,8 +9517,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9932,8 +9527,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9942,8 +9537,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9952,8 +9547,8 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9968,9 +9563,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9980,8 +9575,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9991,8 +9586,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10001,8 +9596,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10011,8 +9606,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10022,8 +9617,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10032,8 +9627,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10042,8 +9637,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10052,8 +9647,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10062,8 +9657,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10072,8 +9667,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10082,8 +9677,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10092,8 +9687,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10108,9 +9703,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10120,8 +9715,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10131,8 +9726,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10141,8 +9736,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10151,8 +9746,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10162,8 +9757,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10172,8 +9767,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10182,8 +9777,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10192,8 +9787,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10202,8 +9797,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10212,8 +9807,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10222,8 +9817,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10232,8 +9827,8 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10248,9 +9843,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10260,8 +9855,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10271,8 +9866,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10281,8 +9876,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10291,8 +9886,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10302,8 +9897,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10312,8 +9907,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10322,8 +9917,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10332,8 +9927,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10342,8 +9937,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10352,8 +9947,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10362,8 +9957,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10372,8 +9967,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10388,9 +9983,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10400,8 +9995,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10411,8 +10006,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10421,8 +10016,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10431,8 +10026,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10442,8 +10037,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10452,8 +10047,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10462,8 +10057,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10472,8 +10067,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10482,8 +10077,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10492,8 +10087,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10502,8 +10097,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10512,8 +10107,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10528,9 +10123,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -10544,8 +10139,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -10559,8 +10154,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -10572,8 +10167,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -10585,8 +10180,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -10600,8 +10195,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10613,8 +10208,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10626,8 +10221,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10639,8 +10234,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10652,8 +10247,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10665,8 +10260,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10678,8 +10273,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10691,8 +10286,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10711,9 +10306,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -10727,8 +10322,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -10742,8 +10337,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -10755,8 +10350,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -10768,8 +10363,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -10783,8 +10378,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10796,8 +10391,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10809,8 +10404,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10822,8 +10417,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10835,8 +10430,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10848,8 +10443,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10861,8 +10456,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10874,8 +10469,8 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10894,9 +10489,9 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -10910,8 +10505,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -10925,8 +10520,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -10938,8 +10533,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -10951,8 +10546,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -10966,8 +10561,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10979,8 +10574,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10992,8 +10587,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11005,8 +10600,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11018,8 +10613,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11031,8 +10626,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11044,8 +10639,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11057,8 +10652,8 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11077,7 +10672,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11092,7 +10686,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11106,7 +10699,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11119,7 +10711,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11132,7 +10723,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11146,7 +10736,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11159,7 +10748,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11172,7 +10760,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11185,7 +10772,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11198,7 +10784,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11211,7 +10796,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11224,8 +10808,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11238,8 +10820,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11259,7 +10839,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11274,7 +10853,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11288,7 +10866,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11301,7 +10878,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11314,7 +10890,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11328,7 +10903,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11341,7 +10915,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11354,7 +10927,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11367,7 +10939,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11380,7 +10951,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11393,7 +10963,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11406,8 +10975,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11420,8 +10987,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11441,7 +11006,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11456,7 +11020,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11470,7 +11033,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11483,7 +11045,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11496,7 +11057,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11510,7 +11070,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11523,7 +11082,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11536,7 +11094,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11549,7 +11106,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11562,7 +11118,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11575,7 +11130,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11588,8 +11142,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11602,8 +11154,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11623,7 +11173,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11638,7 +11187,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11652,7 +11200,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11665,7 +11212,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11678,7 +11224,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11692,7 +11237,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11705,7 +11249,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11718,7 +11261,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11731,7 +11273,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11744,7 +11285,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11757,7 +11297,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11770,8 +11309,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11784,8 +11321,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11805,7 +11340,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11820,7 +11354,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11834,7 +11367,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11847,7 +11379,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11860,7 +11391,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11874,7 +11404,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11887,7 +11416,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11900,7 +11428,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11913,7 +11440,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11926,7 +11452,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11939,7 +11464,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11952,8 +11476,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11966,8 +11488,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11987,7 +11507,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12002,7 +11521,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12016,7 +11534,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12029,7 +11546,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12042,7 +11558,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12056,7 +11571,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12069,7 +11583,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12082,7 +11595,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12095,7 +11607,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12108,7 +11619,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12121,7 +11631,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12134,8 +11643,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12148,8 +11655,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12169,7 +11674,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12184,7 +11688,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12198,7 +11701,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12211,7 +11713,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12224,7 +11725,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12238,7 +11738,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12251,7 +11750,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12264,7 +11762,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12277,7 +11774,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12290,7 +11786,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12303,7 +11798,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12316,8 +11810,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12330,8 +11822,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12351,7 +11841,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12366,7 +11855,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12380,7 +11868,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12393,7 +11880,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12406,7 +11892,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12420,7 +11905,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12433,7 +11917,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12446,7 +11929,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12459,7 +11941,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12472,7 +11953,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12485,7 +11965,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12498,8 +11977,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12512,8 +11989,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12533,7 +12008,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12548,7 +12022,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12562,7 +12035,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12575,7 +12047,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12588,7 +12059,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12602,7 +12072,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12615,7 +12084,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12628,7 +12096,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12641,7 +12108,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12654,7 +12120,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12667,7 +12132,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12680,8 +12144,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12694,8 +12156,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12715,7 +12175,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12730,7 +12189,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12744,7 +12202,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12757,7 +12214,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12770,7 +12226,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12784,7 +12239,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12797,7 +12251,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12810,7 +12263,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12823,7 +12275,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12836,7 +12287,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12849,7 +12299,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12862,8 +12311,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12876,8 +12323,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12897,7 +12342,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12912,7 +12356,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12926,7 +12369,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12939,7 +12381,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12952,7 +12393,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12966,7 +12406,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12979,7 +12418,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12992,7 +12430,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13005,7 +12442,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13018,7 +12454,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13031,7 +12466,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13044,8 +12478,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13058,8 +12490,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13079,7 +12509,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13094,7 +12523,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13108,7 +12536,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13121,7 +12548,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13134,7 +12560,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13148,7 +12573,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13161,7 +12585,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13174,7 +12597,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13187,7 +12609,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13200,7 +12621,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13213,7 +12633,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13226,8 +12645,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13240,8 +12657,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13261,7 +12676,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13276,7 +12690,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13290,7 +12703,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13303,7 +12715,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13316,7 +12727,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13330,7 +12740,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13343,7 +12752,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13356,7 +12764,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13369,7 +12776,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13382,7 +12788,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13395,7 +12800,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13408,8 +12812,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13422,8 +12824,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13443,7 +12843,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13458,7 +12857,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13472,7 +12870,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13485,7 +12882,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13498,7 +12894,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13512,7 +12907,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13525,7 +12919,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13538,7 +12931,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13551,7 +12943,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13564,7 +12955,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13577,7 +12967,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13590,8 +12979,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13604,8 +12991,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13625,7 +13010,6 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13640,7 +13024,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13654,7 +13037,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13667,7 +13049,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13680,7 +13061,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13694,7 +13074,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13707,7 +13086,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13720,7 +13098,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13733,7 +13110,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13746,7 +13122,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13759,7 +13134,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13772,8 +13146,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13786,8 +13158,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13807,10 +13177,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13825,9 +13195,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13842,10 +13212,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -13858,10 +13227,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -13874,9 +13242,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13891,10 +13259,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -13907,10 +13274,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -13923,10 +13289,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -13939,10 +13304,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -13955,7 +13319,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13971,7 +13334,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13987,8 +13349,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14004,8 +13364,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14030,10 +13388,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14048,9 +13406,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14065,10 +13423,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14081,10 +13438,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14097,9 +13453,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14114,10 +13470,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14130,10 +13485,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14146,10 +13500,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14162,10 +13515,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14178,7 +13530,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14194,7 +13545,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14210,8 +13560,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14227,8 +13575,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14253,10 +13599,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14271,9 +13617,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX7-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14288,10 +13634,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14304,10 +13649,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14320,9 +13664,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14337,10 +13681,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14353,10 +13696,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14369,10 +13711,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14385,10 +13726,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14401,7 +13741,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14417,7 +13756,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14433,8 +13771,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14450,8 +13786,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14476,10 +13810,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14494,9 +13828,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,10 +13845,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14527,10 +13860,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14543,9 +13875,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14560,10 +13892,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14576,10 +13907,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14592,10 +13922,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14608,10 +13937,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14624,7 +13952,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14640,7 +13967,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14656,8 +13982,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14673,8 +13997,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14699,10 +14021,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14717,9 +14039,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14734,10 +14056,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14750,10 +14071,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14766,9 +14086,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14783,10 +14103,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14799,10 +14118,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14815,10 +14133,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14831,10 +14148,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14847,7 +14163,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14863,7 +14178,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14879,8 +14193,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14896,8 +14208,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14922,10 +14232,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14940,9 +14250,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14957,10 +14267,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14973,10 +14282,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14989,9 +14297,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15006,10 +14314,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15022,10 +14329,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15038,10 +14344,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15054,10 +14359,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15070,7 +14374,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15086,7 +14389,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15102,8 +14404,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15119,8 +14419,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15145,10 +14443,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15163,9 +14461,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15180,10 +14478,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15196,10 +14493,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15212,9 +14508,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15229,10 +14525,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15245,10 +14540,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15261,10 +14555,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15277,10 +14570,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15293,7 +14585,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15309,7 +14600,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15325,8 +14615,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15342,8 +14630,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15368,10 +14654,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15386,9 +14672,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15403,10 +14689,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15419,10 +14704,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15435,9 +14719,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15452,10 +14736,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15468,10 +14751,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15484,10 +14766,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15500,10 +14781,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15516,7 +14796,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15532,7 +14811,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15548,8 +14826,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15565,8 +14841,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15591,10 +14865,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15609,9 +14883,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15626,10 +14900,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15642,10 +14915,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15658,9 +14930,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15675,10 +14947,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15691,10 +14962,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15707,10 +14977,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15723,10 +14992,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15739,7 +15007,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15755,7 +15022,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15771,8 +15037,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15788,8 +15052,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15814,10 +15076,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15832,9 +15094,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15849,10 +15111,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15865,10 +15126,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15881,9 +15141,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15898,10 +15158,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15914,10 +15173,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15930,10 +15188,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15946,10 +15203,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15962,7 +15218,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15978,7 +15233,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15994,8 +15248,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16011,8 +15263,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16037,10 +15287,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16055,9 +15305,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16072,10 +15322,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16088,10 +15337,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16104,9 +15352,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16121,10 +15369,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16137,10 +15384,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16153,10 +15399,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16169,10 +15414,9 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16185,7 +15429,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16201,7 +15444,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16217,8 +15459,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16234,8 +15474,6 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16260,10 +15498,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16278,9 +15516,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16295,10 +15533,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16311,10 +15548,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16327,9 +15563,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16344,10 +15580,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16360,10 +15595,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16376,10 +15610,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16392,10 +15625,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16408,7 +15640,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16424,7 +15655,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16440,8 +15670,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16457,8 +15685,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16483,10 +15709,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16501,9 +15727,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16518,10 +15744,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16534,10 +15759,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16550,9 +15774,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16567,10 +15791,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16583,10 +15806,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16599,10 +15821,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16615,10 +15836,9 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16631,7 +15851,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16647,7 +15866,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16663,8 +15881,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16680,8 +15896,6 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16706,10 +15920,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16724,9 +15938,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16741,10 +15955,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16757,10 +15970,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16773,9 +15985,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16790,10 +16002,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16806,10 +16017,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16822,10 +16032,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16838,10 +16047,9 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16854,7 +16062,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16870,7 +16077,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16886,8 +16092,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16903,8 +16107,6 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16929,10 +16131,10 @@ entry: define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16947,9 +16149,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16964,10 +16166,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16980,10 +16181,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16996,9 +16196,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17013,10 +16213,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17029,10 +16228,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17045,10 +16243,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17061,10 +16258,9 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17077,7 +16273,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17093,7 +16288,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17109,8 +16303,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17126,8 +16318,6 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 3a337bc74282a6..f2e3e7bf417687 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -32,8 +32,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX7-LABEL: local_system_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -46,8 +46,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX10-WGP-LABEL: local_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -58,8 +58,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX10-CU-LABEL: local_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_system_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -84,8 +84,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -96,8 +96,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -108,8 +108,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -120,8 +120,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_system_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX11-WGP-LABEL: local_system_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX11-CU-LABEL: local_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX12-WGP-LABEL: local_system_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -168,8 +168,8 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX12-CU-LABEL: local_system_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @local_system_monotonic_load( ; GFX6-LABEL: local_system_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -203,8 +203,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX7-LABEL: local_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -217,8 +217,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX10-WGP-LABEL: local_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -229,8 +229,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX10-CU-LABEL: local_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -241,8 +241,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -255,8 +255,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -267,8 +267,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -279,8 +279,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -291,8 +291,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -303,8 +303,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX11-WGP-LABEL: local_system_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -315,8 +315,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX11-CU-LABEL: local_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -327,8 +327,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX12-WGP-LABEL: local_system_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -339,8 +339,8 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX12-CU-LABEL: local_system_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -358,8 +358,8 @@ entry: define amdgpu_kernel void @local_system_acquire_load( ; GFX6-LABEL: local_system_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -374,8 +374,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX7-LABEL: local_system_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -388,8 +388,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX10-WGP-LABEL: local_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX10-CU-LABEL: local_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -413,8 +413,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -427,8 +427,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -439,8 +439,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -452,8 +452,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -464,8 +464,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -477,8 +477,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX11-WGP-LABEL: local_system_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -490,8 +490,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX11-CU-LABEL: local_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -502,8 +502,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX12-WGP-LABEL: local_system_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -515,8 +515,8 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX12-CU-LABEL: local_system_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -534,8 +534,8 @@ entry: define amdgpu_kernel void @local_system_seq_cst_load( ; GFX6-LABEL: local_system_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -551,8 +551,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX7-LABEL: local_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -566,8 +566,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX10-WGP-LABEL: local_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -581,8 +581,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX10-CU-LABEL: local_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +594,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -609,8 +609,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -622,8 +622,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -636,8 +636,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -649,8 +649,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -663,8 +663,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX11-WGP-LABEL: local_system_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -678,8 +678,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX11-CU-LABEL: local_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -691,8 +691,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX12-WGP-LABEL: local_system_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -708,8 +708,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX12-CU-LABEL: local_system_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -728,9 +728,9 @@ entry: define amdgpu_kernel void @local_system_unordered_store( ; GFX6-LABEL: local_system_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -740,8 +740,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX7-LABEL: local_system_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -751,8 +751,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX10-WGP-LABEL: local_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -761,8 +761,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX10-CU-LABEL: local_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -771,8 +771,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_system_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -782,8 +782,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -792,8 +792,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -802,8 +802,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -812,8 +812,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_system_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -822,8 +822,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX11-WGP-LABEL: local_system_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -832,8 +832,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX11-CU-LABEL: local_system_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -842,8 +842,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX12-WGP-LABEL: local_system_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -852,8 +852,8 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX12-CU-LABEL: local_system_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -868,9 +868,9 @@ entry: define amdgpu_kernel void @local_system_monotonic_store( ; GFX6-LABEL: local_system_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -880,8 +880,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX7-LABEL: local_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -891,8 +891,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX10-WGP-LABEL: local_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -901,8 +901,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX10-CU-LABEL: local_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -911,8 +911,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -922,8 +922,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -932,8 +932,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -942,8 +942,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -952,8 +952,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -962,8 +962,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX11-WGP-LABEL: local_system_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -972,8 +972,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX11-CU-LABEL: local_system_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -982,8 +982,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX12-WGP-LABEL: local_system_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -992,8 +992,8 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX12-CU-LABEL: local_system_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1008,9 +1008,9 @@ entry: define amdgpu_kernel void @local_system_release_store( ; GFX6-LABEL: local_system_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1021,8 +1021,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX7-LABEL: local_system_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1033,8 +1033,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX10-WGP-LABEL: local_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1045,8 +1045,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX10-CU-LABEL: local_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1056,8 +1056,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; SKIP-CACHE-INV-LABEL: local_system_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1079,8 +1079,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1090,8 +1090,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX940-TGSPLIT-LABEL: local_system_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1112,8 +1112,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX11-WGP-LABEL: local_system_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1124,8 +1124,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX11-CU-LABEL: local_system_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1135,8 +1135,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX12-WGP-LABEL: local_system_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1149,8 +1149,8 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX12-CU-LABEL: local_system_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1166,9 +1166,9 @@ entry: define amdgpu_kernel void @local_system_seq_cst_store( ; GFX6-LABEL: local_system_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1179,8 +1179,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX7-LABEL: local_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1191,8 +1191,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX10-WGP-LABEL: local_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1203,8 +1203,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX10-CU-LABEL: local_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1214,8 +1214,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1226,8 +1226,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1237,8 +1237,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1248,8 +1248,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1259,8 +1259,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1270,8 +1270,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX11-WGP-LABEL: local_system_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1282,8 +1282,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX11-CU-LABEL: local_system_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1293,8 +1293,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX12-WGP-LABEL: local_system_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1307,8 +1307,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX12-CU-LABEL: local_system_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1324,9 +1324,9 @@ entry: define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX6-LABEL: local_system_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX7-LABEL: local_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1347,8 +1347,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1357,8 +1357,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1367,8 +1367,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1378,8 +1378,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1388,8 +1388,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1398,8 +1398,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1408,8 +1408,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1428,8 +1428,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1438,8 +1438,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1448,8 +1448,8 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_system_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1464,9 +1464,9 @@ entry: define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX6-LABEL: local_system_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1477,8 +1477,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX7-LABEL: local_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1489,8 +1489,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1501,8 +1501,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1512,8 +1512,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1524,8 +1524,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1535,8 +1535,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1546,8 +1546,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1557,8 +1557,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1568,8 +1568,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1580,8 +1580,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1591,8 +1591,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1603,8 +1603,8 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_system_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1620,9 +1620,9 @@ entry: define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX6-LABEL: local_system_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1633,8 +1633,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX7-LABEL: local_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1645,8 +1645,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1657,8 +1657,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX10-CU-LABEL: local_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1668,8 +1668,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1680,8 +1680,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1691,8 +1691,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1702,8 +1702,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1713,8 +1713,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1724,8 +1724,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX11-CU-LABEL: local_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1747,8 +1747,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1761,8 +1761,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX12-CU-LABEL: local_system_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1778,9 +1778,9 @@ entry: define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX6-LABEL: local_system_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1792,8 +1792,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1805,8 +1805,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1819,8 +1819,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1831,8 +1831,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1844,8 +1844,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1856,8 +1856,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1868,8 +1868,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1880,8 +1880,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1892,8 +1892,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1906,8 +1906,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1918,8 +1918,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1934,8 +1934,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_system_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1952,9 +1952,9 @@ entry: define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX6-LABEL: local_system_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1966,8 +1966,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1979,8 +1979,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1993,8 +1993,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -2005,8 +2005,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -2018,8 +2018,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -2030,8 +2030,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -2042,8 +2042,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -2054,8 +2054,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -2066,8 +2066,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -2080,8 +2080,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -2092,8 +2092,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -2108,8 +2108,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_system_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -2126,9 +2126,9 @@ entry: define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX6-LABEL: local_system_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2142,8 +2142,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2157,8 +2157,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2171,8 +2171,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2184,8 +2184,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2199,8 +2199,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2212,8 +2212,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2226,8 +2226,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2239,8 +2239,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,8 +2253,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,8 +2267,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2280,8 +2280,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2294,8 +2294,8 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_system_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2314,9 +2314,9 @@ entry: define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2331,8 +2331,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2347,8 +2347,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2363,8 +2363,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2377,8 +2377,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2393,8 +2393,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2407,8 +2407,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2422,8 +2422,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2436,8 +2436,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2451,8 +2451,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2467,8 +2467,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2481,8 +2481,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2499,8 +2499,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2520,9 +2520,9 @@ entry: define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2537,8 +2537,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2553,8 +2553,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2569,8 +2569,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2583,8 +2583,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2599,8 +2599,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2613,8 +2613,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,8 +2628,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2642,8 +2642,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2657,8 +2657,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2673,8 +2673,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2687,8 +2687,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2705,8 +2705,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2726,7 +2726,6 @@ entry: define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2741,7 +2740,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2755,7 +2753,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2768,7 +2765,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2781,7 +2777,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2795,7 +2790,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2808,7 +2802,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2821,7 +2814,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2834,7 +2826,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2847,7 +2838,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2860,7 +2850,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2873,8 +2862,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2887,8 +2874,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2908,7 +2893,6 @@ entry: define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2924,7 +2908,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2939,7 +2922,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2954,7 +2936,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2968,7 +2949,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2983,7 +2963,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2997,7 +2976,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3011,7 +2989,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3025,7 +3002,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3039,7 +3015,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3054,7 +3029,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3068,8 +3042,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3084,8 +3056,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3106,7 +3076,6 @@ entry: define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX6-LABEL: local_system_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3122,7 +3091,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3137,7 +3105,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3152,7 +3119,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3166,7 +3132,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3181,7 +3146,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3195,7 +3159,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3209,7 +3172,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3223,7 +3185,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3237,7 +3198,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3252,7 +3212,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3266,8 +3225,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3284,8 +3241,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3306,7 +3261,6 @@ entry: define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3323,7 +3277,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3339,7 +3292,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3356,7 +3308,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3371,7 +3322,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3387,7 +3337,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3402,7 +3351,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3417,7 +3365,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3432,7 +3379,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3447,7 +3393,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3464,7 +3409,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3479,8 +3423,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3499,8 +3441,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3522,7 +3462,6 @@ entry: define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3539,7 +3478,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3555,7 +3493,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3572,7 +3509,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3587,7 +3523,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3603,7 +3538,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3618,7 +3552,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3633,7 +3566,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3648,7 +3580,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3663,7 +3594,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3680,7 +3610,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3695,8 +3624,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3715,8 +3642,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3738,7 +3663,6 @@ entry: define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3754,7 +3678,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3769,7 +3692,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3784,7 +3706,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3798,7 +3719,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3813,7 +3733,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3827,7 +3746,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3841,7 +3759,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3855,7 +3772,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3869,7 +3785,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3884,7 +3799,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3898,8 +3812,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,8 +3826,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3936,7 +3846,6 @@ entry: define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3952,7 +3861,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3967,7 +3875,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3982,7 +3889,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3996,7 +3902,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4011,7 +3916,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4025,7 +3929,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4039,7 +3942,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4053,7 +3955,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4067,7 +3968,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4082,7 +3982,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4096,8 +3995,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4112,8 +4009,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4134,7 +4029,6 @@ entry: define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX6-LABEL: local_system_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4151,7 +4045,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4167,7 +4060,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4184,7 +4076,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4199,7 +4090,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4215,7 +4105,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4230,7 +4119,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4245,7 +4133,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4260,7 +4147,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4275,7 +4161,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4292,7 +4177,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4307,8 +4191,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4327,8 +4209,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4350,7 +4230,6 @@ entry: define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4367,7 +4246,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4383,7 +4261,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4400,7 +4277,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4415,7 +4291,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4431,7 +4306,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4446,7 +4320,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4461,7 +4334,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4476,7 +4348,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4491,7 +4362,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4508,7 +4378,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4523,8 +4392,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4543,8 +4410,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4566,7 +4431,6 @@ entry: define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4583,7 +4447,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4599,7 +4462,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4616,7 +4478,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4631,7 +4492,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4647,7 +4507,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4662,7 +4521,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4677,7 +4535,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4692,7 +4549,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4707,7 +4563,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4724,7 +4579,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4739,8 +4593,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4759,8 +4611,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4782,7 +4632,6 @@ entry: define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4799,7 +4648,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4815,7 +4663,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4832,7 +4679,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4847,7 +4693,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4863,7 +4708,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4878,7 +4722,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4893,7 +4736,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4908,7 +4750,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4923,7 +4764,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4940,7 +4780,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4955,8 +4794,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4975,8 +4812,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4998,7 +4833,6 @@ entry: define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5015,7 +4849,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5031,7 +4864,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5048,7 +4880,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5063,7 +4894,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5079,7 +4909,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5094,7 +4923,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5109,7 +4937,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5124,7 +4951,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5139,7 +4965,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5156,7 +4981,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5171,8 +4995,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5191,8 +5013,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5214,7 +5034,6 @@ entry: define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5231,7 +5050,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5247,7 +5065,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5264,7 +5081,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5279,7 +5095,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5295,7 +5110,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5310,7 +5124,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5325,7 +5138,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5340,7 +5152,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5355,7 +5166,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5372,7 +5182,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5387,8 +5196,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5407,8 +5214,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5430,7 +5235,6 @@ entry: define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5447,7 +5251,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5463,7 +5266,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5480,7 +5282,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5495,7 +5296,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5511,7 +5311,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5526,7 +5325,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5541,7 +5339,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5556,7 +5353,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5571,7 +5367,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5588,7 +5383,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5603,8 +5397,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5623,8 +5415,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5646,7 +5436,6 @@ entry: define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5663,7 +5452,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5679,7 +5467,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5696,7 +5483,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5711,7 +5497,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5727,7 +5512,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5742,7 +5526,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5757,7 +5540,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5772,7 +5554,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5787,7 +5568,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5804,7 +5584,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5819,8 +5598,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5839,8 +5616,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5862,10 +5637,10 @@ entry: define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5880,9 +5655,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5897,10 +5672,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5913,10 +5687,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5929,9 +5702,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5946,10 +5719,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5962,10 +5734,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5978,10 +5749,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5994,10 +5764,9 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6010,7 +5779,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6026,7 +5794,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6042,8 +5809,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6059,8 +5824,6 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6085,10 +5848,10 @@ entry: define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6103,9 +5866,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6120,10 +5883,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6137,10 +5899,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6153,9 +5914,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6170,10 +5931,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6186,10 +5946,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6203,10 +5962,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6219,10 +5977,9 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6236,7 +5993,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6253,7 +6009,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6269,8 +6024,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6287,8 +6040,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6313,10 +6064,10 @@ entry: define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6332,9 +6083,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6350,10 +6101,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6368,10 +6118,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6385,9 +6134,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6403,10 +6152,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6420,10 +6168,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6437,10 +6184,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6454,10 +6200,9 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6471,7 +6216,6 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6489,7 +6233,6 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6506,8 +6249,6 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6527,8 +6268,6 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6554,10 +6293,10 @@ entry: define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6573,9 +6312,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6591,10 +6330,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6610,10 +6348,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6627,9 +6364,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6645,10 +6382,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6662,10 +6398,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6680,10 +6415,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6697,10 +6431,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6715,7 +6448,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6734,7 +6466,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6751,8 +6482,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6773,8 +6502,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6800,10 +6527,10 @@ entry: define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6819,9 +6546,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6837,10 +6564,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6856,10 +6582,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6873,9 +6598,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6891,10 +6616,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6908,10 +6632,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6926,10 +6649,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6943,10 +6665,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6961,7 +6682,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,7 +6700,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6997,8 +6716,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7019,8 +6736,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7046,10 +6761,10 @@ entry: define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7064,9 +6779,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7081,10 +6796,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7098,10 +6812,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7114,9 +6827,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7131,10 +6844,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7147,10 +6859,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7164,10 +6875,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7180,10 +6890,9 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7197,7 +6906,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7214,7 +6922,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7230,8 +6937,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7248,8 +6953,6 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7274,10 +6977,10 @@ entry: define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7292,9 +6995,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7309,10 +7012,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7326,10 +7028,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7342,9 +7043,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7359,10 +7060,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7375,10 +7075,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7392,10 +7091,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7408,10 +7106,9 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7425,7 +7122,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7442,7 +7138,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7458,8 +7153,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7476,8 +7169,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7502,10 +7193,10 @@ entry: define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7521,9 +7212,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7539,10 +7230,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7558,10 +7248,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7575,9 +7264,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7593,10 +7282,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7610,10 +7298,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7628,10 +7315,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7645,10 +7331,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7663,7 +7348,6 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7682,7 +7366,6 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,8 +7382,6 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7721,8 +7402,6 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7748,10 +7427,10 @@ entry: define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7767,9 +7446,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7785,10 +7464,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7804,10 +7482,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7821,9 +7498,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7839,10 +7516,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7856,10 +7532,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7874,10 +7549,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7891,10 +7565,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7909,7 +7582,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7928,7 +7600,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7945,8 +7616,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7967,8 +7636,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7994,10 +7661,10 @@ entry: define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8013,9 +7680,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8031,10 +7698,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8050,10 +7716,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8067,9 +7732,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8085,10 +7750,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8102,10 +7766,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8120,10 +7783,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8137,10 +7799,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8155,7 +7816,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8174,7 +7834,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8191,8 +7850,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8213,8 +7870,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8240,10 +7895,10 @@ entry: define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8259,9 +7914,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8277,10 +7932,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8296,10 +7950,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8313,9 +7966,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8331,10 +7984,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8348,10 +8000,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8366,10 +8017,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8383,10 +8033,9 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8401,7 +8050,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8420,7 +8068,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8437,8 +8084,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8459,8 +8104,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8486,10 +8129,10 @@ entry: define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8505,9 +8148,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8523,10 +8166,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8542,10 +8184,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8559,9 +8200,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8577,10 +8218,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8594,10 +8234,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8612,10 +8251,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8629,10 +8267,9 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8647,7 +8284,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8666,7 +8302,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8683,8 +8318,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8705,8 +8338,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8732,10 +8363,10 @@ entry: define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8751,9 +8382,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8769,10 +8400,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8788,10 +8418,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8805,9 +8434,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8823,10 +8452,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8840,10 +8468,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8858,10 +8485,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8875,10 +8501,9 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8893,7 +8518,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8912,7 +8536,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8929,8 +8552,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8951,8 +8572,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8978,10 +8597,10 @@ entry: define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8997,9 +8616,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -9015,10 +8634,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -9034,10 +8652,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -9051,9 +8668,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -9069,10 +8686,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9086,10 +8702,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9104,10 +8719,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9121,10 +8735,9 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9139,7 +8752,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9158,7 +8770,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9175,8 +8786,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9197,8 +8806,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9224,10 +8831,10 @@ entry: define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -9243,9 +8850,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -9261,10 +8868,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -9280,10 +8886,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -9297,9 +8902,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -9315,10 +8920,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9332,10 +8936,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9350,10 +8953,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9367,10 +8969,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9385,7 +8986,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9404,7 +9004,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9421,8 +9020,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9443,8 +9040,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9470,8 +9065,8 @@ entry: define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX6-LABEL: local_system_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9486,8 +9081,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX7-LABEL: local_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9500,8 +9095,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: local_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9512,8 +9107,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX10-CU-LABEL: local_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9524,8 +9119,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9538,8 +9133,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9550,8 +9145,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9562,8 +9157,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9574,8 +9169,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9586,8 +9181,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX11-WGP-LABEL: local_system_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9598,8 +9193,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX11-CU-LABEL: local_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9610,8 +9205,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX12-WGP-LABEL: local_system_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9622,8 +9217,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_system_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9641,8 +9236,8 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX6-LABEL: local_system_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9657,8 +9252,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX7-LABEL: local_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9671,8 +9266,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9683,8 +9278,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9695,8 +9290,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9709,8 +9304,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9721,8 +9316,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9733,8 +9328,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9745,8 +9340,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9757,8 +9352,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9769,8 +9364,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9781,8 +9376,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9793,8 +9388,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9812,8 +9407,8 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX6-LABEL: local_system_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9828,8 +9423,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX7-LABEL: local_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9842,8 +9437,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9854,8 +9449,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9866,8 +9461,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9880,8 +9475,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9892,8 +9487,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9904,8 +9499,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9916,8 +9511,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9928,8 +9523,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9940,8 +9535,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9952,8 +9547,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9964,8 +9559,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9983,8 +9578,8 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX6-LABEL: local_system_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9999,8 +9594,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX7-LABEL: local_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10013,8 +9608,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -10025,8 +9620,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -10037,8 +9632,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10051,8 +9646,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10063,8 +9658,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10075,8 +9670,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10087,8 +9682,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10099,8 +9694,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -10111,8 +9706,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -10123,8 +9718,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -10135,8 +9730,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -10154,9 +9749,9 @@ entry: define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX6-LABEL: local_system_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10166,8 +9761,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX7-LABEL: local_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10177,8 +9772,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: local_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10187,8 +9782,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX10-CU-LABEL: local_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10197,8 +9792,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10208,8 +9803,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10218,8 +9813,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10228,8 +9823,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10238,8 +9833,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10248,8 +9843,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX11-WGP-LABEL: local_system_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10258,8 +9853,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX11-CU-LABEL: local_system_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10268,8 +9863,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX12-WGP-LABEL: local_system_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10278,8 +9873,8 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX12-CU-LABEL: local_system_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10294,9 +9889,9 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX6-LABEL: local_system_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10306,8 +9901,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX7-LABEL: local_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10317,8 +9912,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10327,8 +9922,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10337,8 +9932,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10348,8 +9943,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10358,8 +9953,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10368,8 +9963,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10378,8 +9973,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10388,8 +9983,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10398,8 +9993,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10408,8 +10003,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10418,8 +10013,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10434,9 +10029,9 @@ entry: define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-LABEL: local_system_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10446,8 +10041,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX7-LABEL: local_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10457,8 +10052,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX10-WGP-LABEL: local_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10467,8 +10062,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX10-CU-LABEL: local_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10477,8 +10072,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10488,8 +10083,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10498,8 +10093,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10508,8 +10103,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10518,8 +10113,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10528,8 +10123,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX11-WGP-LABEL: local_system_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10538,8 +10133,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX11-CU-LABEL: local_system_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10548,8 +10143,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX12-WGP-LABEL: local_system_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10558,8 +10153,8 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX12-CU-LABEL: local_system_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10574,9 +10169,9 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-LABEL: local_system_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10586,8 +10181,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX7-LABEL: local_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10597,8 +10192,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10607,8 +10202,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10617,8 +10212,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10628,8 +10223,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10638,8 +10233,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10648,8 +10243,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10658,8 +10253,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10668,8 +10263,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10678,8 +10273,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10688,8 +10283,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10698,8 +10293,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10714,9 +10309,9 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10726,8 +10321,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10737,8 +10332,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10747,8 +10342,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10757,8 +10352,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10768,8 +10363,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10778,8 +10373,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10788,8 +10383,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10798,8 +10393,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10808,8 +10403,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10818,8 +10413,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10828,8 +10423,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10838,8 +10433,8 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10854,9 +10449,9 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10866,8 +10461,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10877,8 +10472,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10887,8 +10482,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10897,8 +10492,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10908,8 +10503,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10918,8 +10513,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10928,8 +10523,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10938,8 +10533,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10948,8 +10543,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10958,8 +10553,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10968,8 +10563,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10978,8 +10573,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10994,9 +10589,9 @@ entry: define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX6-LABEL: local_system_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11006,8 +10601,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11017,8 +10612,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11027,8 +10622,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11037,8 +10632,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11048,8 +10643,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11058,8 +10653,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11068,8 +10663,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11078,8 +10673,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11088,8 +10683,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11098,8 +10693,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11108,8 +10703,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11118,8 +10713,8 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11134,9 +10729,9 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11146,8 +10741,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11157,8 +10752,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11167,8 +10762,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11177,8 +10772,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11188,8 +10783,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11198,8 +10793,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11208,8 +10803,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11218,8 +10813,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11228,8 +10823,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11238,8 +10833,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11248,8 +10843,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11258,8 +10853,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11274,9 +10869,9 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11286,8 +10881,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11297,8 +10892,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11307,8 +10902,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11317,8 +10912,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11328,8 +10923,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11338,8 +10933,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11348,8 +10943,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11358,8 +10953,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11368,8 +10963,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11378,8 +10973,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11388,8 +10983,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11398,8 +10993,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11414,9 +11009,9 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11430,8 +11025,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11445,8 +11040,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11458,8 +11053,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11471,8 +11066,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11486,8 +11081,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11499,8 +11094,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11512,8 +11107,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11525,8 +11120,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11538,8 +11133,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11551,8 +11146,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11564,8 +11159,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11577,8 +11172,8 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11597,9 +11192,9 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11613,8 +11208,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11628,8 +11223,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11641,8 +11236,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11654,8 +11249,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11669,8 +11264,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11682,8 +11277,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11695,8 +11290,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11708,8 +11303,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11721,8 +11316,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11734,8 +11329,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11747,8 +11342,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11760,8 +11355,8 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11780,9 +11375,9 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11796,8 +11391,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11811,8 +11406,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11824,8 +11419,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11837,8 +11432,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11852,8 +11447,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11865,8 +11460,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11878,8 +11473,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11891,8 +11486,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11904,8 +11499,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11917,8 +11512,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11930,8 +11525,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11943,8 +11538,8 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11963,7 +11558,6 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11978,7 +11572,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11992,7 +11585,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12005,7 +11597,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12018,7 +11609,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12032,7 +11622,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12045,7 +11634,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12058,7 +11646,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12071,7 +11658,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12084,7 +11670,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12097,7 +11682,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12110,8 +11694,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12124,8 +11706,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12145,7 +11725,6 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12160,7 +11739,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12174,7 +11752,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12187,7 +11764,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12200,7 +11776,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12214,7 +11789,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12227,7 +11801,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12240,7 +11813,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12253,7 +11825,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12266,7 +11837,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12279,7 +11849,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12292,8 +11861,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12306,8 +11873,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12327,7 +11892,6 @@ entry: define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12342,7 +11906,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12356,7 +11919,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12369,7 +11931,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12382,7 +11943,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12396,7 +11956,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12409,7 +11968,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12422,7 +11980,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12435,7 +11992,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12448,7 +12004,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12461,7 +12016,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12474,8 +12028,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12488,8 +12040,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12509,7 +12059,6 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12524,7 +12073,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12538,7 +12086,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12551,7 +12098,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12564,7 +12110,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12578,7 +12123,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12591,7 +12135,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12604,7 +12147,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12617,7 +12159,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12630,7 +12171,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12643,7 +12183,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12656,8 +12195,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12670,8 +12207,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12691,7 +12226,6 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12706,7 +12240,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12720,7 +12253,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12733,7 +12265,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12746,7 +12277,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12760,7 +12290,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12773,7 +12302,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12786,7 +12314,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12799,7 +12326,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12812,7 +12338,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12825,7 +12350,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12838,8 +12362,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12852,8 +12374,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12873,7 +12393,6 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12888,7 +12407,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12902,7 +12420,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12915,7 +12432,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12928,7 +12444,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12942,7 +12457,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12955,7 +12469,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12968,7 +12481,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12981,7 +12493,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12994,7 +12505,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13007,7 +12517,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13020,8 +12529,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13034,8 +12541,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13055,7 +12560,6 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13070,7 +12574,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13084,7 +12587,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13097,7 +12599,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13110,7 +12611,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13124,7 +12624,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13137,7 +12636,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13150,7 +12648,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13163,7 +12660,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13176,7 +12672,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13189,7 +12684,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13202,8 +12696,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13216,8 +12708,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13237,7 +12727,6 @@ entry: define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13252,7 +12741,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13266,7 +12754,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13279,7 +12766,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13292,7 +12778,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13306,7 +12791,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13319,7 +12803,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13332,7 +12815,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13345,7 +12827,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13358,7 +12839,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13371,7 +12851,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13384,8 +12863,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13398,8 +12875,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13419,7 +12894,6 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13434,7 +12908,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13448,7 +12921,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13461,7 +12933,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13474,7 +12945,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13488,7 +12958,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13501,7 +12970,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13514,7 +12982,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13527,7 +12994,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13540,7 +13006,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13553,7 +13018,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13566,8 +13030,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13580,8 +13042,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13601,7 +13061,6 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13616,7 +13075,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13630,7 +13088,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13643,7 +13100,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13656,7 +13112,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13670,7 +13125,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13683,7 +13137,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13696,7 +13149,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13709,7 +13161,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13722,7 +13173,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13735,7 +13185,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13748,8 +13197,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13762,8 +13209,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13783,7 +13228,6 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13798,7 +13242,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13812,7 +13255,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13825,7 +13267,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13838,7 +13279,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13852,7 +13292,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13865,7 +13304,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13878,7 +13316,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13891,7 +13328,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13904,7 +13340,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13917,7 +13352,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13930,8 +13364,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13944,8 +13376,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13965,7 +13395,6 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13980,7 +13409,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13994,7 +13422,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14007,7 +13434,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14020,7 +13446,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14034,7 +13459,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14047,7 +13471,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14060,7 +13483,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14073,7 +13495,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14086,7 +13507,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14099,7 +13519,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14112,8 +13531,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14126,8 +13543,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14147,7 +13562,6 @@ entry: define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14162,7 +13576,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14176,7 +13589,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14189,7 +13601,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14202,7 +13613,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14216,7 +13626,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14229,7 +13638,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14242,7 +13650,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14255,7 +13662,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14268,7 +13674,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14281,7 +13686,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14294,8 +13698,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14308,8 +13710,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14329,7 +13729,6 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14344,7 +13743,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14358,7 +13756,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14371,7 +13768,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14384,7 +13780,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14398,7 +13793,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14411,7 +13805,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14424,7 +13817,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14437,7 +13829,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14450,7 +13841,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14463,7 +13853,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14476,8 +13865,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14490,8 +13877,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14511,7 +13896,6 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14526,7 +13910,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14540,7 +13923,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14553,7 +13935,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14566,7 +13947,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14580,7 +13960,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14593,7 +13972,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14606,7 +13984,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14619,7 +13996,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14632,7 +14008,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14645,7 +14020,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14658,8 +14032,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14672,8 +14044,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14693,10 +14063,10 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14711,9 +14081,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14728,10 +14098,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14744,10 +14113,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14760,9 +14128,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14777,10 +14145,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14793,10 +14160,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14809,10 +14175,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14825,10 +14190,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14841,7 +14205,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14857,7 +14220,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14873,8 +14235,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14890,8 +14250,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14916,10 +14274,10 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14934,9 +14292,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14951,10 +14309,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14967,10 +14324,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14983,9 +14339,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15000,10 +14356,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15016,10 +14371,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15032,10 +14386,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15048,10 +14401,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15064,7 +14416,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15080,7 +14431,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15096,8 +14446,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15113,8 +14461,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15139,10 +14485,10 @@ entry: define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15157,9 +14503,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15174,10 +14520,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15190,10 +14535,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15206,9 +14550,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15223,10 +14567,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15239,10 +14582,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15255,10 +14597,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15271,10 +14612,9 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15287,7 +14627,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15303,7 +14642,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15319,8 +14657,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15336,8 +14672,6 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15362,10 +14696,10 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15380,9 +14714,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15397,10 +14731,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15413,10 +14746,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15429,9 +14761,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15446,10 +14778,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15462,10 +14793,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15478,10 +14808,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15494,10 +14823,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15510,7 +14838,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15526,7 +14853,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15542,8 +14868,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15559,8 +14883,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15585,10 +14907,10 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15603,9 +14925,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15620,10 +14942,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15636,10 +14957,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15652,9 +14972,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15669,10 +14989,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15685,10 +15004,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15701,10 +15019,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15717,10 +15034,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15733,7 +15049,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15749,7 +15064,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15765,8 +15079,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15782,8 +15094,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15808,10 +15118,10 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15826,9 +15136,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15843,10 +15153,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15859,10 +15168,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15875,9 +15183,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15892,10 +15200,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15908,10 +15215,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15924,10 +15230,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15940,10 +15245,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15956,7 +15260,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15972,7 +15275,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15988,8 +15290,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16005,8 +15305,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16031,10 +15329,10 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16049,9 +15347,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16066,10 +15364,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16082,10 +15379,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16098,9 +15394,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16115,10 +15411,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16131,10 +15426,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16147,10 +15441,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16163,10 +15456,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16179,7 +15471,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16195,7 +15486,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16211,8 +15501,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16228,8 +15516,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16254,10 +15540,10 @@ entry: define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16272,9 +15558,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16289,10 +15575,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16305,10 +15590,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16321,9 +15605,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16338,10 +15622,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16354,10 +15637,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16370,10 +15652,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16386,10 +15667,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16402,7 +15682,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16418,7 +15697,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16434,8 +15712,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16451,8 +15727,6 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16477,10 +15751,10 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16495,9 +15769,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16512,10 +15786,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16528,10 +15801,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16544,9 +15816,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16561,10 +15833,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16577,10 +15848,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16593,10 +15863,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16609,10 +15878,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16625,7 +15893,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16641,7 +15908,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16657,8 +15923,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16674,8 +15938,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16700,10 +15962,10 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16718,9 +15980,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16735,10 +15997,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16751,10 +16012,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16767,9 +16027,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16784,10 +16044,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16800,10 +16059,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16816,10 +16074,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16832,10 +16089,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16848,7 +16104,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16864,7 +16119,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16880,8 +16134,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16897,8 +16149,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16923,10 +16173,10 @@ entry: define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16941,9 +16191,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16958,10 +16208,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16974,10 +16223,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16990,9 +16238,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17007,10 +16255,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17023,10 +16270,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17039,10 +16285,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17055,10 +16300,9 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17071,7 +16315,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17087,7 +16330,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17103,8 +16345,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17120,8 +16360,6 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17146,10 +16384,10 @@ entry: define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17164,9 +16402,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17181,10 +16419,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17197,10 +16434,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17213,9 +16449,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17230,10 +16466,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17246,10 +16481,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17262,10 +16496,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17278,10 +16511,9 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17294,7 +16526,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17310,7 +16541,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17326,8 +16556,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17343,8 +16571,6 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17369,10 +16595,10 @@ entry: define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17387,9 +16613,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17404,10 +16630,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17420,10 +16645,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17436,9 +16660,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17453,10 +16677,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17469,10 +16692,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17485,10 +16707,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17501,10 +16722,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17517,7 +16737,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17533,7 +16752,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17549,8 +16767,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17566,8 +16782,6 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17592,10 +16806,10 @@ entry: define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17610,9 +16824,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17627,10 +16841,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17643,10 +16856,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17659,9 +16871,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17676,10 +16888,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17692,10 +16903,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17708,10 +16918,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17724,10 +16933,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17740,7 +16948,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17756,7 +16963,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17772,8 +16978,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17789,8 +16993,6 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17815,10 +17017,10 @@ entry: define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17833,9 +17035,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17850,10 +17052,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17866,10 +17067,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17882,9 +17082,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17899,10 +17099,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17915,10 +17114,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17931,10 +17129,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17947,10 +17144,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17963,7 +17159,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17979,7 +17174,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17995,8 +17189,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18012,8 +17204,6 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 4439f9ef818a97..9740e0ae1d1671 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -12,9 +12,10 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX6-LABEL: local_volatile_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x9 -; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr4 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr4 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s7, s1 ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -33,9 +34,8 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX7-LABEL: local_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 @@ -48,10 +48,8 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX10-WGP-LABEL: local_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -62,10 +60,8 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX10-CU-LABEL: local_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -76,8 +72,9 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; SKIP-CACHE-INV-LABEL: local_volatile_load_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -96,9 +93,8 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX11-WGP-LABEL: local_volatile_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -109,9 +105,8 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX11-CU-LABEL: local_volatile_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -122,10 +117,8 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX12-WGP-LABEL: local_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 @@ -136,10 +129,8 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX12-CU-LABEL: local_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 @@ -157,9 +148,10 @@ entry: define amdgpu_kernel void @local_volatile_load_1( ; GFX6-LABEL: local_volatile_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x9 -; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr4 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr4 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s7, s1 ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -180,9 +172,8 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX7-LABEL: local_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -197,11 +188,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX10-WGP-LABEL: local_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -213,11 +202,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX10-CU-LABEL: local_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -229,8 +216,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; SKIP-CACHE-INV-LABEL: local_volatile_load_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -251,10 +239,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX11-WGP-LABEL: local_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 @@ -268,10 +255,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX11-CU-LABEL: local_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 @@ -285,11 +271,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -305,11 +289,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -334,9 +316,9 @@ entry: define amdgpu_kernel void @local_volatile_store_0( ; GFX6-LABEL: local_volatile_store_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s1, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -348,9 +330,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX7-LABEL: local_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -362,10 +343,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX10-WGP-LABEL: local_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s5, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 @@ -376,10 +355,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX10-CU-LABEL: local_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s5, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 @@ -390,9 +367,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; SKIP-CACHE-INV-LABEL: local_volatile_store_0: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 @@ -404,9 +380,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX11-WGP-LABEL: local_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -417,9 +392,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX11-CU-LABEL: local_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -430,10 +404,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX12-WGP-LABEL: local_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 @@ -449,10 +421,8 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX12-CU-LABEL: local_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 @@ -475,9 +445,9 @@ entry: define amdgpu_kernel void @local_volatile_store_1( ; GFX6-LABEL: local_volatile_store_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s1, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 2 @@ -491,9 +461,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX7-LABEL: local_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -507,9 +476,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-WGP-LABEL: local_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_mov_b32 s5, 2 @@ -521,9 +489,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-CU-LABEL: local_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-CU-NEXT: s_mov_b32 s5, 2 @@ -535,9 +502,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; SKIP-CACHE-INV-LABEL: local_volatile_store_1: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 @@ -551,8 +517,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-WGP-LABEL: local_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff @@ -566,8 +532,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-CU-LABEL: local_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff @@ -581,8 +547,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-WGP-LABEL: local_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff @@ -603,8 +569,8 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-CU-LABEL: local_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff @@ -634,8 +600,8 @@ entry: define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX6-LABEL: local_volatile_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s1, s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xa +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xa ; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr0 ; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr1 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -650,8 +616,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX7-LABEL: local_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -664,8 +630,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -677,8 +643,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -689,8 +655,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -703,8 +669,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: local_volatile_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -716,8 +682,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX11-CU-LABEL: local_volatile_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -728,8 +694,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: local_volatile_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -741,8 +707,8 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX12-CU-LABEL: local_volatile_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -760,9 +726,9 @@ entry: define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX6-LABEL: local_volatile_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s1, s[2:3], 0xa +; GFX6-NEXT: s_load_dword s1, s[4:5], 0xa ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s1 @@ -773,8 +739,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX7-LABEL: local_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -785,8 +751,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: local_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -797,8 +763,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: local_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -808,8 +774,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -820,8 +786,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX11-WGP-LABEL: local_volatile_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -832,8 +798,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX11-CU-LABEL: local_volatile_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -843,8 +809,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX12-WGP-LABEL: local_volatile_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -857,8 +823,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX12-CU-LABEL: local_volatile_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index 694ffb2964f569..056559834ca1c5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -32,8 +32,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX7-LABEL: local_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -46,8 +46,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX10-WGP-LABEL: local_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -58,8 +58,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX10-CU-LABEL: local_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -84,8 +84,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -96,8 +96,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -108,8 +108,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -120,8 +120,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX11-WGP-LABEL: local_wavefront_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX11-CU-LABEL: local_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX12-WGP-LABEL: local_wavefront_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -168,8 +168,8 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX12-CU-LABEL: local_wavefront_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX6-LABEL: local_wavefront_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -203,8 +203,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX7-LABEL: local_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -217,8 +217,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -229,8 +229,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -241,8 +241,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -255,8 +255,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -267,8 +267,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -279,8 +279,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -291,8 +291,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -303,8 +303,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -315,8 +315,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -327,8 +327,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -339,8 +339,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -358,8 +358,8 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX6-LABEL: local_wavefront_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -374,8 +374,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX7-LABEL: local_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -388,8 +388,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -400,8 +400,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX10-CU-LABEL: local_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -412,8 +412,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -426,8 +426,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -438,8 +438,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -450,8 +450,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -462,8 +462,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -474,8 +474,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -486,8 +486,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX11-CU-LABEL: local_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -498,8 +498,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -510,8 +510,8 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX12-CU-LABEL: local_wavefront_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -529,8 +529,8 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX6-LABEL: local_wavefront_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -545,8 +545,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX7-LABEL: local_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -559,8 +559,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -571,8 +571,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -583,8 +583,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -597,8 +597,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -609,8 +609,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -621,8 +621,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -633,8 +633,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -645,8 +645,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -657,8 +657,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -669,8 +669,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -681,8 +681,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -700,9 +700,9 @@ entry: define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX6-LABEL: local_wavefront_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -712,8 +712,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX7-LABEL: local_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -723,8 +723,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: local_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -733,8 +733,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX10-CU-LABEL: local_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -743,8 +743,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -754,8 +754,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -764,8 +764,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -774,8 +774,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -784,8 +784,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -794,8 +794,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX11-WGP-LABEL: local_wavefront_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -804,8 +804,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX11-CU-LABEL: local_wavefront_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -814,8 +814,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX12-WGP-LABEL: local_wavefront_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -824,8 +824,8 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX12-CU-LABEL: local_wavefront_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -840,9 +840,9 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX6-LABEL: local_wavefront_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -852,8 +852,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX7-LABEL: local_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -863,8 +863,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -873,8 +873,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -883,8 +883,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -894,8 +894,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -904,8 +904,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -914,8 +914,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -924,8 +924,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -934,8 +934,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -944,8 +944,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -954,8 +954,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -964,8 +964,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -980,9 +980,9 @@ entry: define amdgpu_kernel void @local_wavefront_release_store( ; GFX6-LABEL: local_wavefront_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -992,8 +992,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX7-LABEL: local_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1003,8 +1003,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX10-WGP-LABEL: local_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1013,8 +1013,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX10-CU-LABEL: local_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1023,8 +1023,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1034,8 +1034,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1044,8 +1044,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1054,8 +1054,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1064,8 +1064,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1074,8 +1074,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX11-WGP-LABEL: local_wavefront_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1084,8 +1084,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX11-CU-LABEL: local_wavefront_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1094,8 +1094,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX12-WGP-LABEL: local_wavefront_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1104,8 +1104,8 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX12-CU-LABEL: local_wavefront_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1120,9 +1120,9 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX6-LABEL: local_wavefront_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1132,8 +1132,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX7-LABEL: local_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1143,8 +1143,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1153,8 +1153,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1163,8 +1163,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1174,8 +1174,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1184,8 +1184,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1194,8 +1194,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1204,8 +1204,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1214,8 +1214,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1224,8 +1224,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1244,8 +1244,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1260,9 +1260,9 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX6-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1272,8 +1272,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX7-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1283,8 +1283,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1293,8 +1293,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1303,8 +1303,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1314,8 +1314,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1324,8 +1324,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1334,8 +1334,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1344,8 +1344,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1354,8 +1354,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1364,8 +1364,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1374,8 +1374,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1384,8 +1384,8 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1400,9 +1400,9 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX6-LABEL: local_wavefront_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1412,8 +1412,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX7-LABEL: local_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1423,8 +1423,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1433,8 +1433,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1443,8 +1443,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1454,8 +1454,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1464,8 +1464,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1474,8 +1474,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1484,8 +1484,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1494,8 +1494,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1504,8 +1504,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1514,8 +1514,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1524,8 +1524,8 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1540,9 +1540,9 @@ entry: define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX6-LABEL: local_wavefront_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1552,8 +1552,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX7-LABEL: local_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1563,8 +1563,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1573,8 +1573,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1583,8 +1583,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1594,8 +1594,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1604,8 +1604,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1614,8 +1614,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1624,8 +1624,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1634,8 +1634,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1644,8 +1644,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1654,8 +1654,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1664,8 +1664,8 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1680,9 +1680,9 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX6-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1692,8 +1692,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1703,8 +1703,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1713,8 +1713,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1723,8 +1723,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1734,8 +1734,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1744,8 +1744,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1754,8 +1754,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1764,8 +1764,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1774,8 +1774,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1784,8 +1784,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1794,8 +1794,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1804,8 +1804,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1820,9 +1820,9 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX6-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1832,8 +1832,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1843,8 +1843,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1853,8 +1853,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1863,8 +1863,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1874,8 +1874,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1884,8 +1884,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1894,8 +1894,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1904,8 +1904,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1914,8 +1914,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1924,8 +1924,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1934,8 +1934,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1944,8 +1944,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1960,9 +1960,9 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -1976,8 +1976,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -1991,8 +1991,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2004,8 +2004,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2017,8 +2017,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2032,8 +2032,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2045,8 +2045,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2058,8 +2058,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2071,8 +2071,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2084,8 +2084,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2097,8 +2097,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2110,8 +2110,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2123,8 +2123,8 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2143,9 +2143,9 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2159,8 +2159,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2174,8 +2174,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2187,8 +2187,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2200,8 +2200,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2215,8 +2215,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2228,8 +2228,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2241,8 +2241,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2254,8 +2254,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,8 +2267,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2280,8 +2280,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2293,8 +2293,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2306,8 +2306,8 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2326,9 +2326,9 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2342,8 +2342,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2357,8 +2357,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2370,8 +2370,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2383,8 +2383,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2398,8 +2398,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2411,8 +2411,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2424,8 +2424,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2437,8 +2437,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2450,8 +2450,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2463,8 +2463,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2476,8 +2476,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2489,8 +2489,8 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2509,7 +2509,6 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2524,7 +2523,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2538,7 +2536,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2551,7 +2548,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2564,7 +2560,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2578,7 +2573,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2591,7 +2585,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2604,7 +2597,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2617,7 +2609,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2630,7 +2621,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2643,7 +2633,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2656,8 +2645,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2670,8 +2657,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2691,7 +2676,6 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2706,7 +2690,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2720,7 +2703,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2733,7 +2715,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2746,7 +2727,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2760,7 +2740,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2773,7 +2752,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2786,7 +2764,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2799,7 +2776,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2812,7 +2788,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2825,7 +2800,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2838,8 +2812,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2852,8 +2824,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2873,7 +2843,6 @@ entry: define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2888,7 +2857,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2902,7 +2870,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2915,7 +2882,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2928,7 +2894,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2942,7 +2907,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2955,7 +2919,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2968,7 +2931,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2981,7 +2943,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2994,7 +2955,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3007,7 +2967,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3020,8 +2979,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3034,8 +2991,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3055,7 +3010,6 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3070,7 +3024,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3084,7 +3037,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3097,7 +3049,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3110,7 +3061,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3124,7 +3074,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3137,7 +3086,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3150,7 +3098,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3163,7 +3110,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3176,7 +3122,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3189,7 +3134,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3202,8 +3146,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3216,8 +3158,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3237,7 +3177,6 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3252,7 +3191,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3266,7 +3204,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3279,7 +3216,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3292,7 +3228,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3306,7 +3241,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3319,7 +3253,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3332,7 +3265,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3345,7 +3277,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3358,7 +3289,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3371,7 +3301,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3384,8 +3313,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3398,8 +3325,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3419,7 +3344,6 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3434,7 +3358,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3448,7 +3371,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3461,7 +3383,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3474,7 +3395,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3488,7 +3408,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3501,7 +3420,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3514,7 +3432,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3527,7 +3444,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3540,7 +3456,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3553,7 +3468,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3566,8 +3480,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3580,8 +3492,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3601,7 +3511,6 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3616,7 +3525,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3630,7 +3538,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3643,7 +3550,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3656,7 +3562,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3670,7 +3575,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3683,7 +3587,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3696,7 +3599,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3709,7 +3611,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3722,7 +3623,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3735,7 +3635,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3748,8 +3647,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3762,8 +3659,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3783,7 +3678,6 @@ entry: define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3798,7 +3692,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3812,7 +3705,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3825,7 +3717,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3838,7 +3729,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3852,7 +3742,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3865,7 +3754,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3878,7 +3766,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3891,7 +3778,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3904,7 +3790,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3917,7 +3802,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3930,8 +3814,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3944,8 +3826,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3965,7 +3845,6 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3980,7 +3859,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3994,7 +3872,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4007,7 +3884,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4020,7 +3896,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4034,7 +3909,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4047,7 +3921,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4060,7 +3933,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4073,7 +3945,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4086,7 +3957,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4099,7 +3969,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4112,8 +3981,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4126,8 +3993,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4147,7 +4012,6 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4162,7 +4026,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4176,7 +4039,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4189,7 +4051,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4202,7 +4063,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4216,7 +4076,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4229,7 +4088,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4242,7 +4100,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4255,7 +4112,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4268,7 +4124,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4281,7 +4136,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4294,8 +4148,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4308,8 +4160,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4329,7 +4179,6 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4344,7 +4193,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4358,7 +4206,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4371,7 +4218,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4384,7 +4230,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4398,7 +4243,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4411,7 +4255,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4424,7 +4267,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4437,7 +4279,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4450,7 +4291,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4463,7 +4303,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4476,8 +4315,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4490,8 +4327,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4511,7 +4346,6 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4526,7 +4360,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4540,7 +4373,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4553,7 +4385,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4566,7 +4397,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4580,7 +4410,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4593,7 +4422,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4606,7 +4434,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4619,7 +4446,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4632,7 +4458,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4645,7 +4470,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4658,8 +4482,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4672,8 +4494,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4693,7 +4513,6 @@ entry: define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4708,7 +4527,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4722,7 +4540,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4735,7 +4552,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4748,7 +4564,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4762,7 +4577,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4775,7 +4589,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4788,7 +4601,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4801,7 +4613,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4814,7 +4625,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4827,7 +4637,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4840,8 +4649,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4854,8 +4661,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4875,7 +4680,6 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4890,7 +4694,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4904,7 +4707,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4917,7 +4719,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4930,7 +4731,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4944,7 +4744,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4957,7 +4756,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4970,7 +4768,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4983,7 +4780,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4996,7 +4792,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5009,7 +4804,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5022,8 +4816,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5036,8 +4828,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5057,7 +4847,6 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5072,7 +4861,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5086,7 +4874,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5099,7 +4886,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5112,7 +4898,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5126,7 +4911,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5139,7 +4923,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5152,7 +4935,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5165,7 +4947,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5178,7 +4959,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5191,7 +4971,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5204,8 +4983,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5218,8 +4995,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5239,10 +5014,10 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5257,9 +5032,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5274,10 +5049,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5290,10 +5064,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5306,9 +5079,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5323,10 +5096,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5339,10 +5111,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5355,10 +5126,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5371,10 +5141,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5387,7 +5156,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5403,7 +5171,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5419,8 +5186,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5436,8 +5201,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5462,10 +5225,10 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5480,9 +5243,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5497,10 +5260,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5513,10 +5275,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5529,9 +5290,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5546,10 +5307,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5562,10 +5322,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5578,10 +5337,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5594,10 +5352,9 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5610,7 +5367,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5626,7 +5382,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5642,8 +5397,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5659,8 +5412,6 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5685,10 +5436,10 @@ entry: define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5703,9 +5454,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5720,10 +5471,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5736,10 +5486,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5752,9 +5501,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5769,10 +5518,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5785,10 +5533,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5801,10 +5548,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5817,10 +5563,9 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5833,7 +5578,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5849,7 +5593,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5865,8 +5608,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5882,8 +5623,6 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5908,10 +5647,10 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5926,9 +5665,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5943,10 +5682,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5959,10 +5697,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5975,9 +5712,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5992,10 +5729,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6008,10 +5744,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6024,10 +5759,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6040,10 +5774,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6056,7 +5789,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6072,7 +5804,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6088,8 +5819,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6105,8 +5834,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6131,10 +5858,10 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6149,9 +5876,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6166,10 +5893,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6182,10 +5908,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6198,9 +5923,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6215,10 +5940,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6231,10 +5955,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6247,10 +5970,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6263,10 +5985,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6279,7 +6000,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6295,7 +6015,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6311,8 +6030,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6328,8 +6045,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6354,10 +6069,10 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6372,9 +6087,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6389,10 +6104,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6405,10 +6119,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6421,9 +6134,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6438,10 +6151,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6454,10 +6166,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6470,10 +6181,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6486,10 +6196,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6502,7 +6211,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6518,7 +6226,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6534,8 +6241,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6551,8 +6256,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6577,10 +6280,10 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6595,9 +6298,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6612,10 +6315,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6628,10 +6330,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6644,9 +6345,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6661,10 +6362,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6677,10 +6377,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6693,10 +6392,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6709,10 +6407,9 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6725,7 +6422,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6741,7 +6437,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6757,8 +6452,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6774,8 +6467,6 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6800,10 +6491,10 @@ entry: define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6818,9 +6509,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6835,10 +6526,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6851,10 +6541,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6867,9 +6556,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6884,10 +6573,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6900,10 +6588,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6916,10 +6603,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6932,10 +6618,9 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6948,7 +6633,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6964,7 +6648,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,8 +6663,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6997,8 +6678,6 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7023,10 +6702,10 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7041,9 +6720,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7058,10 +6737,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7074,10 +6752,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7090,9 +6767,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7107,10 +6784,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7123,10 +6799,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7139,10 +6814,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7155,10 +6829,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7171,7 +6844,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7187,7 +6859,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7203,8 +6874,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7220,8 +6889,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7246,10 +6913,10 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7264,9 +6931,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7281,10 +6948,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7297,10 +6963,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7313,9 +6978,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7330,10 +6995,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7346,10 +7010,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7362,10 +7025,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7378,10 +7040,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7394,7 +7055,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7410,7 +7070,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7426,8 +7085,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7443,8 +7100,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7469,10 +7124,10 @@ entry: define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7487,9 +7142,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7504,10 +7159,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7520,10 +7174,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7536,9 +7189,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7553,10 +7206,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7569,10 +7221,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7585,10 +7236,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7601,10 +7251,9 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7617,7 +7266,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7633,7 +7281,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7649,8 +7296,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7666,8 +7311,6 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7692,10 +7335,10 @@ entry: define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7710,9 +7353,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7727,10 +7370,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7743,10 +7385,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7759,9 +7400,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7776,10 +7417,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7792,10 +7432,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7808,10 +7447,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7824,10 +7462,9 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7840,7 +7477,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7856,7 +7492,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7872,8 +7507,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7889,8 +7522,6 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7915,10 +7546,10 @@ entry: define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7933,9 +7564,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7950,10 +7581,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7966,10 +7596,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7982,9 +7611,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7999,10 +7628,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8015,10 +7643,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8031,10 +7658,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8047,10 +7673,9 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8063,7 +7688,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8079,7 +7703,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8095,8 +7718,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8112,8 +7733,6 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8138,10 +7757,10 @@ entry: define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8156,9 +7775,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8173,10 +7792,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8189,10 +7807,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8205,9 +7822,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8222,10 +7839,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8238,10 +7854,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8254,10 +7869,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8270,10 +7884,9 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8286,7 +7899,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8302,7 +7914,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8318,8 +7929,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8335,8 +7944,6 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8361,10 +7968,10 @@ entry: define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8379,9 +7986,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8396,10 +8003,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8412,10 +8018,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8428,9 +8033,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8445,10 +8050,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8461,10 +8065,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8477,10 +8080,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8493,10 +8095,9 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8509,7 +8110,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8525,7 +8125,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8541,8 +8140,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8558,8 +8155,6 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8584,8 +8179,8 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX6-LABEL: local_wavefront_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -8600,8 +8195,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX7-LABEL: local_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -8614,8 +8209,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -8626,8 +8221,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX10-CU-LABEL: local_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -8638,8 +8233,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -8652,8 +8247,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8664,8 +8259,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8676,8 +8271,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8688,8 +8283,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8700,8 +8295,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -8712,8 +8307,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX11-CU-LABEL: local_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -8724,8 +8319,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -8736,8 +8331,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -8755,8 +8350,8 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX6-LABEL: local_wavefront_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -8771,8 +8366,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -8785,8 +8380,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -8797,8 +8392,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -8809,8 +8404,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -8823,8 +8418,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8835,8 +8430,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8847,8 +8442,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8859,8 +8454,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -8871,8 +8466,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -8883,8 +8478,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -8895,8 +8490,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -8907,8 +8502,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -8926,8 +8521,8 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX6-LABEL: local_wavefront_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -8942,8 +8537,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -8956,8 +8551,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -8968,8 +8563,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -8980,8 +8575,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -8994,8 +8589,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9006,8 +8601,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9018,8 +8613,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9030,8 +8625,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9042,8 +8637,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9054,8 +8649,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9066,8 +8661,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9078,8 +8673,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9097,8 +8692,8 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9113,8 +8708,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9127,8 +8722,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9139,8 +8734,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9151,8 +8746,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9165,8 +8760,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9177,8 +8772,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9189,8 +8784,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9201,8 +8796,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9213,8 +8808,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9225,8 +8820,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9237,8 +8832,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9249,8 +8844,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9268,9 +8863,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX6-LABEL: local_wavefront_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9280,8 +8875,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX7-LABEL: local_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9291,8 +8886,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9301,8 +8896,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX10-CU-LABEL: local_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9311,8 +8906,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9322,8 +8917,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9332,8 +8927,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9342,8 +8937,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9352,8 +8947,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9362,8 +8957,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9372,8 +8967,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX11-CU-LABEL: local_wavefront_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9382,8 +8977,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9392,8 +8987,8 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX12-CU-LABEL: local_wavefront_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9408,9 +9003,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX6-LABEL: local_wavefront_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9420,8 +9015,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9431,8 +9026,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9441,8 +9036,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9451,8 +9046,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9462,8 +9057,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9472,8 +9067,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9482,8 +9077,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9492,8 +9087,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9502,8 +9097,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9512,8 +9107,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9522,8 +9117,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9532,8 +9127,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9548,9 +9143,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX6-LABEL: local_wavefront_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9560,8 +9155,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX7-LABEL: local_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9571,8 +9166,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9581,8 +9176,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9591,8 +9186,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9602,8 +9197,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9612,8 +9207,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9622,8 +9217,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9632,8 +9227,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9642,8 +9237,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9652,8 +9247,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9662,8 +9257,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9672,8 +9267,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9688,9 +9283,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9700,8 +9295,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9711,8 +9306,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9721,8 +9316,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9731,8 +9326,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9742,8 +9337,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9752,8 +9347,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9762,8 +9357,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9772,8 +9367,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9782,8 +9377,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9792,8 +9387,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9802,8 +9397,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9812,8 +9407,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9828,9 +9423,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9840,8 +9435,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9851,8 +9446,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -9861,8 +9456,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -9871,8 +9466,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9882,8 +9477,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9892,8 +9487,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -9902,8 +9497,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9912,8 +9507,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -9922,8 +9517,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9932,8 +9527,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9942,8 +9537,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -9952,8 +9547,8 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -9968,9 +9563,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -9980,8 +9575,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9991,8 +9586,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10001,8 +9596,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10011,8 +9606,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10022,8 +9617,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10032,8 +9627,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10042,8 +9637,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10052,8 +9647,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10062,8 +9657,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10072,8 +9667,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10082,8 +9677,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10092,8 +9687,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10108,9 +9703,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10120,8 +9715,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10131,8 +9726,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10141,8 +9736,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10151,8 +9746,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10162,8 +9757,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10172,8 +9767,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10182,8 +9777,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10192,8 +9787,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10202,8 +9797,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10212,8 +9807,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10222,8 +9817,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10232,8 +9827,8 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10248,9 +9843,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10260,8 +9855,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10271,8 +9866,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10281,8 +9876,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10291,8 +9886,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10302,8 +9897,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10312,8 +9907,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10322,8 +9917,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10332,8 +9927,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10342,8 +9937,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10352,8 +9947,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10362,8 +9957,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10372,8 +9967,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10388,9 +9983,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10400,8 +9995,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10411,8 +10006,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10421,8 +10016,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10431,8 +10026,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10442,8 +10037,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10452,8 +10047,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10462,8 +10057,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10472,8 +10067,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10482,8 +10077,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10492,8 +10087,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10502,8 +10097,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10512,8 +10107,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10528,9 +10123,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -10544,8 +10139,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -10559,8 +10154,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -10572,8 +10167,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -10585,8 +10180,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -10600,8 +10195,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10613,8 +10208,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10626,8 +10221,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10639,8 +10234,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10652,8 +10247,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10665,8 +10260,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10678,8 +10273,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10691,8 +10286,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10711,9 +10306,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -10727,8 +10322,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -10742,8 +10337,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -10755,8 +10350,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -10768,8 +10363,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -10783,8 +10378,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10796,8 +10391,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10809,8 +10404,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10822,8 +10417,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -10835,8 +10430,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10848,8 +10443,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10861,8 +10456,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -10874,8 +10469,8 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -10894,9 +10489,9 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -10910,8 +10505,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -10925,8 +10520,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -10938,8 +10533,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -10951,8 +10546,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -10966,8 +10561,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10979,8 +10574,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -10992,8 +10587,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11005,8 +10600,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11018,8 +10613,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11031,8 +10626,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11044,8 +10639,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11057,8 +10652,8 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11077,7 +10672,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11092,7 +10686,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11106,7 +10699,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11119,7 +10711,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11132,7 +10723,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11146,7 +10736,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11159,7 +10748,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11172,7 +10760,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11185,7 +10772,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11198,7 +10784,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11211,7 +10796,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11224,8 +10808,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11238,8 +10820,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11259,7 +10839,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11274,7 +10853,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11288,7 +10866,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11301,7 +10878,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11314,7 +10890,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11328,7 +10903,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11341,7 +10915,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11354,7 +10927,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11367,7 +10939,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11380,7 +10951,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11393,7 +10963,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11406,8 +10975,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11420,8 +10987,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11441,7 +11006,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11456,7 +11020,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11470,7 +11033,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11483,7 +11045,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11496,7 +11057,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11510,7 +11070,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11523,7 +11082,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11536,7 +11094,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11549,7 +11106,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11562,7 +11118,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11575,7 +11130,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11588,8 +11142,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11602,8 +11154,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11623,7 +11173,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11638,7 +11187,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11652,7 +11200,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11665,7 +11212,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11678,7 +11224,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11692,7 +11237,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11705,7 +11249,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11718,7 +11261,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11731,7 +11273,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11744,7 +11285,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11757,7 +11297,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11770,8 +11309,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11784,8 +11321,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11805,7 +11340,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11820,7 +11354,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11834,7 +11367,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11847,7 +11379,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11860,7 +11391,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -11874,7 +11404,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11887,7 +11416,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -11900,7 +11428,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11913,7 +11440,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -11926,7 +11452,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11939,7 +11464,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11952,8 +11476,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11966,8 +11488,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11987,7 +11507,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12002,7 +11521,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12016,7 +11534,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12029,7 +11546,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12042,7 +11558,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12056,7 +11571,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12069,7 +11583,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12082,7 +11595,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12095,7 +11607,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12108,7 +11619,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12121,7 +11631,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12134,8 +11643,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12148,8 +11655,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12169,7 +11674,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12184,7 +11688,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12198,7 +11701,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12211,7 +11713,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12224,7 +11725,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12238,7 +11738,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12251,7 +11750,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12264,7 +11762,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12277,7 +11774,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12290,7 +11786,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12303,7 +11798,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12316,8 +11810,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12330,8 +11822,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12351,7 +11841,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12366,7 +11855,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12380,7 +11868,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12393,7 +11880,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12406,7 +11892,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12420,7 +11905,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12433,7 +11917,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12446,7 +11929,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12459,7 +11941,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12472,7 +11953,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12485,7 +11965,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12498,8 +11977,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12512,8 +11989,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12533,7 +12008,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12548,7 +12022,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12562,7 +12035,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12575,7 +12047,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12588,7 +12059,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12602,7 +12072,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12615,7 +12084,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12628,7 +12096,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12641,7 +12108,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12654,7 +12120,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12667,7 +12132,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12680,8 +12144,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12694,8 +12156,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12715,7 +12175,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12730,7 +12189,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12744,7 +12202,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12757,7 +12214,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12770,7 +12226,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12784,7 +12239,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12797,7 +12251,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12810,7 +12263,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12823,7 +12275,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12836,7 +12287,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12849,7 +12299,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12862,8 +12311,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12876,8 +12323,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12897,7 +12342,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12912,7 +12356,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12926,7 +12369,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12939,7 +12381,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12952,7 +12393,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12966,7 +12406,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12979,7 +12418,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12992,7 +12430,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13005,7 +12442,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13018,7 +12454,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13031,7 +12466,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13044,8 +12478,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13058,8 +12490,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13079,7 +12509,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13094,7 +12523,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13108,7 +12536,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13121,7 +12548,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13134,7 +12560,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13148,7 +12573,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13161,7 +12585,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13174,7 +12597,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13187,7 +12609,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13200,7 +12621,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13213,7 +12633,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13226,8 +12645,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13240,8 +12657,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13261,7 +12676,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13276,7 +12690,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13290,7 +12703,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13303,7 +12715,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13316,7 +12727,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13330,7 +12740,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13343,7 +12752,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13356,7 +12764,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13369,7 +12776,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13382,7 +12788,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13395,7 +12800,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13408,8 +12812,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13422,8 +12824,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13443,7 +12843,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13458,7 +12857,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13472,7 +12870,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13485,7 +12882,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13498,7 +12894,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13512,7 +12907,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13525,7 +12919,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13538,7 +12931,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13551,7 +12943,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13564,7 +12955,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13577,7 +12967,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13590,8 +12979,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13604,8 +12991,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13625,7 +13010,6 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13640,7 +13024,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13654,7 +13037,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13667,7 +13049,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13680,7 +13061,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13694,7 +13074,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13707,7 +13086,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13720,7 +13098,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13733,7 +13110,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13746,7 +13122,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13759,7 +13134,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13772,8 +13146,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13786,8 +13158,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13807,10 +13177,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -13825,9 +13195,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -13842,10 +13212,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -13858,10 +13227,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -13874,9 +13242,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -13891,10 +13259,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -13907,10 +13274,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -13923,10 +13289,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -13939,10 +13304,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -13955,7 +13319,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13971,7 +13334,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -13987,8 +13349,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14004,8 +13364,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14030,10 +13388,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14048,9 +13406,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14065,10 +13423,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14081,10 +13438,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14097,9 +13453,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14114,10 +13470,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14130,10 +13485,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14146,10 +13500,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14162,10 +13515,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14178,7 +13530,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14194,7 +13545,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14210,8 +13560,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14227,8 +13575,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14253,10 +13599,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14271,9 +13617,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14288,10 +13634,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14304,10 +13649,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14320,9 +13664,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14337,10 +13681,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14353,10 +13696,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14369,10 +13711,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14385,10 +13726,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14401,7 +13741,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14417,7 +13756,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14433,8 +13771,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14450,8 +13786,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14476,10 +13810,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14494,9 +13828,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,10 +13845,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14527,10 +13860,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14543,9 +13875,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14560,10 +13892,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14576,10 +13907,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14592,10 +13922,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14608,10 +13937,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14624,7 +13952,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14640,7 +13967,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14656,8 +13982,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14673,8 +13997,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14699,10 +14021,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14717,9 +14039,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14734,10 +14056,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14750,10 +14071,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14766,9 +14086,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14783,10 +14103,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14799,10 +14118,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14815,10 +14133,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14831,10 +14148,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14847,7 +14163,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14863,7 +14178,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14879,8 +14193,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14896,8 +14208,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14922,10 +14232,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14940,9 +14250,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14957,10 +14267,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14973,10 +14282,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14989,9 +14297,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15006,10 +14314,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15022,10 +14329,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15038,10 +14344,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15054,10 +14359,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15070,7 +14374,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15086,7 +14389,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15102,8 +14404,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15119,8 +14419,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15145,10 +14443,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15163,9 +14461,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15180,10 +14478,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15196,10 +14493,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15212,9 +14508,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15229,10 +14525,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15245,10 +14540,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15261,10 +14555,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15277,10 +14570,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15293,7 +14585,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15309,7 +14600,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15325,8 +14615,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15342,8 +14630,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15368,10 +14654,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15386,9 +14672,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15403,10 +14689,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15419,10 +14704,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15435,9 +14719,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15452,10 +14736,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15468,10 +14751,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15484,10 +14766,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15500,10 +14781,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15516,7 +14796,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15532,7 +14811,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15548,8 +14826,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15565,8 +14841,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15591,10 +14865,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15609,9 +14883,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15626,10 +14900,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15642,10 +14915,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15658,9 +14930,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15675,10 +14947,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15691,10 +14962,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15707,10 +14977,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15723,10 +14992,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15739,7 +15007,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15755,7 +15022,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15771,8 +15037,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15788,8 +15052,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15814,10 +15076,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15832,9 +15094,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15849,10 +15111,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15865,10 +15126,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15881,9 +15141,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15898,10 +15158,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15914,10 +15173,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15930,10 +15188,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15946,10 +15203,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15962,7 +15218,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15978,7 +15233,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15994,8 +15248,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16011,8 +15263,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16037,10 +15287,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16055,9 +15305,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16072,10 +15322,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16088,10 +15337,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16104,9 +15352,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16121,10 +15369,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16137,10 +15384,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16153,10 +15399,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16169,10 +15414,9 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16185,7 +15429,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16201,7 +15444,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16217,8 +15459,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16234,8 +15474,6 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16260,10 +15498,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16278,9 +15516,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16295,10 +15533,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16311,10 +15548,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16327,9 +15563,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16344,10 +15580,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16360,10 +15595,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16376,10 +15610,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16392,10 +15625,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16408,7 +15640,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16424,7 +15655,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16440,8 +15670,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16457,8 +15685,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16483,10 +15709,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16501,9 +15727,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16518,10 +15744,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16534,10 +15759,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16550,9 +15774,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16567,10 +15791,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16583,10 +15806,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16599,10 +15821,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16615,10 +15836,9 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16631,7 +15851,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16647,7 +15866,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16663,8 +15881,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16680,8 +15896,6 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16706,10 +15920,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16724,9 +15938,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16741,10 +15955,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16757,10 +15970,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16773,9 +15985,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16790,10 +16002,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16806,10 +16017,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16822,10 +16032,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16838,10 +16047,9 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16854,7 +16062,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16870,7 +16077,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16886,8 +16092,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16903,8 +16107,6 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16929,10 +16131,10 @@ entry: define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16947,9 +16149,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16964,10 +16166,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16980,10 +16181,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16996,9 +16196,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17013,10 +16213,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17029,10 +16228,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17045,10 +16243,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17061,10 +16258,9 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17077,7 +16273,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17093,7 +16288,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17109,8 +16303,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17126,8 +16318,6 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index af6033c844209d..0c79e0bfca9dfe 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX6-LABEL: local_workgroup_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -32,8 +32,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX7-LABEL: local_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -46,8 +46,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: local_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -58,8 +58,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX10-CU-LABEL: local_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -84,8 +84,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -96,8 +96,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -108,8 +108,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -120,8 +120,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX11-WGP-LABEL: local_workgroup_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX11-CU-LABEL: local_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX12-WGP-LABEL: local_workgroup_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -168,8 +168,8 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX12-CU-LABEL: local_workgroup_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_load( ; GFX6-LABEL: local_workgroup_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -203,8 +203,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX7-LABEL: local_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -217,8 +217,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -229,8 +229,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -241,8 +241,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -255,8 +255,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -267,8 +267,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -279,8 +279,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -291,8 +291,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -303,8 +303,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -315,8 +315,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -327,8 +327,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -339,8 +339,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -358,8 +358,8 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX6-LABEL: local_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -374,8 +374,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX7-LABEL: local_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -388,8 +388,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX10-CU-LABEL: local_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -413,8 +413,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -427,8 +427,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -439,8 +439,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -452,8 +452,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -464,8 +464,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -477,8 +477,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -490,8 +490,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX11-CU-LABEL: local_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -502,8 +502,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -515,8 +515,8 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX12-CU-LABEL: local_workgroup_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -534,8 +534,8 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX6-LABEL: local_workgroup_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -551,8 +551,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX7-LABEL: local_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -566,8 +566,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -581,8 +581,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +594,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -609,8 +609,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -622,8 +622,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -636,8 +636,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -649,8 +649,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -663,8 +663,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -678,8 +678,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -691,8 +691,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 @@ -708,8 +708,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -728,9 +728,9 @@ entry: define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX6-LABEL: local_workgroup_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -740,8 +740,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX7-LABEL: local_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -751,8 +751,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: local_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -761,8 +761,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX10-CU-LABEL: local_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -771,8 +771,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -782,8 +782,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -792,8 +792,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -802,8 +802,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -812,8 +812,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -822,8 +822,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX11-WGP-LABEL: local_workgroup_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -832,8 +832,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX11-CU-LABEL: local_workgroup_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -842,8 +842,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX12-WGP-LABEL: local_workgroup_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -852,8 +852,8 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX12-CU-LABEL: local_workgroup_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -868,9 +868,9 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX6-LABEL: local_workgroup_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -880,8 +880,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX7-LABEL: local_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -891,8 +891,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -901,8 +901,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -911,8 +911,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -922,8 +922,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -932,8 +932,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -942,8 +942,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -952,8 +952,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -962,8 +962,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -972,8 +972,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -982,8 +982,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -992,8 +992,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1008,9 +1008,9 @@ entry: define amdgpu_kernel void @local_workgroup_release_store( ; GFX6-LABEL: local_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1021,8 +1021,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX7-LABEL: local_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1033,8 +1033,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX10-WGP-LABEL: local_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1045,8 +1045,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX10-CU-LABEL: local_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1056,8 +1056,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1079,8 +1079,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1090,8 +1090,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1112,8 +1112,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX11-WGP-LABEL: local_workgroup_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1124,8 +1124,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX11-CU-LABEL: local_workgroup_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1135,8 +1135,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX12-WGP-LABEL: local_workgroup_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1149,8 +1149,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX12-CU-LABEL: local_workgroup_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1166,9 +1166,9 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX6-LABEL: local_workgroup_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1179,8 +1179,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX7-LABEL: local_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1191,8 +1191,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1203,8 +1203,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1214,8 +1214,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1226,8 +1226,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1237,8 +1237,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1248,8 +1248,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1259,8 +1259,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1270,8 +1270,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1282,8 +1282,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1293,8 +1293,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1307,8 +1307,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1324,9 +1324,9 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; GFX6-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX7-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1347,8 +1347,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1357,8 +1357,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1367,8 +1367,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1378,8 +1378,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1388,8 +1388,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1398,8 +1398,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1408,8 +1408,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1428,8 +1428,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1438,8 +1438,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1448,8 +1448,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1464,9 +1464,9 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX6-LABEL: local_workgroup_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1477,8 +1477,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX7-LABEL: local_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1489,8 +1489,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1501,8 +1501,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1512,8 +1512,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1524,8 +1524,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1535,8 +1535,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1546,8 +1546,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1557,8 +1557,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1568,8 +1568,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1580,8 +1580,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1591,8 +1591,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1603,8 +1603,8 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1620,9 +1620,9 @@ entry: define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX6-LABEL: local_workgroup_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1633,8 +1633,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX7-LABEL: local_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1645,8 +1645,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1657,8 +1657,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1668,8 +1668,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1680,8 +1680,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1691,8 +1691,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1702,8 +1702,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1713,8 +1713,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1724,8 +1724,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1747,8 +1747,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1761,8 +1761,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1778,9 +1778,9 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX6-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1792,8 +1792,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1805,8 +1805,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1819,8 +1819,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -1831,8 +1831,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -1844,8 +1844,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1856,8 +1856,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -1868,8 +1868,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1880,8 +1880,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -1892,8 +1892,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1906,8 +1906,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1918,8 +1918,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -1934,8 +1934,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -1952,9 +1952,9 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX6-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -1966,8 +1966,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -1979,8 +1979,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -1993,8 +1993,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -2005,8 +2005,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -2018,8 +2018,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -2030,8 +2030,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -2042,8 +2042,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -2054,8 +2054,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -2066,8 +2066,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -2080,8 +2080,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -2092,8 +2092,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -2108,8 +2108,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -2126,9 +2126,9 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2142,8 +2142,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2157,8 +2157,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2171,8 +2171,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2184,8 +2184,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2199,8 +2199,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2212,8 +2212,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2226,8 +2226,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2239,8 +2239,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,8 +2253,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,8 +2267,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2280,8 +2280,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2294,8 +2294,8 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2314,9 +2314,9 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2331,8 +2331,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2347,8 +2347,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2363,8 +2363,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2377,8 +2377,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2393,8 +2393,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2407,8 +2407,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2422,8 +2422,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2436,8 +2436,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2451,8 +2451,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2467,8 +2467,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2481,8 +2481,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2499,8 +2499,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2520,9 +2520,9 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -2537,8 +2537,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2553,8 +2553,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -2569,8 +2569,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -2583,8 +2583,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -2599,8 +2599,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2613,8 +2613,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -2628,8 +2628,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2642,8 +2642,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -2657,8 +2657,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2673,8 +2673,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2687,8 +2687,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -2705,8 +2705,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -2726,7 +2726,6 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2741,7 +2740,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2755,7 +2753,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2768,7 +2765,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2781,7 +2777,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2795,7 +2790,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2808,7 +2802,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2821,7 +2814,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2834,7 +2826,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -2847,7 +2838,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2860,7 +2850,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2873,8 +2862,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2887,8 +2874,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -2908,7 +2893,6 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -2924,7 +2908,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -2939,7 +2922,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2954,7 +2936,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2968,7 +2949,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -2983,7 +2963,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -2997,7 +2976,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3011,7 +2989,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3025,7 +3002,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3039,7 +3015,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3054,7 +3029,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3068,8 +3042,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3084,8 +3056,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3106,7 +3076,6 @@ entry: define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3122,7 +3091,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3137,7 +3105,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3152,7 +3119,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3166,7 +3132,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3181,7 +3146,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3195,7 +3159,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3209,7 +3172,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3223,7 +3185,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3237,7 +3198,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3252,7 +3212,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3266,8 +3225,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3284,8 +3241,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3306,7 +3261,6 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3323,7 +3277,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3339,7 +3292,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3356,7 +3308,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3371,7 +3322,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3387,7 +3337,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3402,7 +3351,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3417,7 +3365,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3432,7 +3379,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3447,7 +3393,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3464,7 +3409,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3479,8 +3423,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3499,8 +3441,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3522,7 +3462,6 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3539,7 +3478,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3555,7 +3493,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3572,7 +3509,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3587,7 +3523,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3603,7 +3538,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3618,7 +3552,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3633,7 +3566,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3648,7 +3580,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3663,7 +3594,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3680,7 +3610,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3695,8 +3624,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3715,8 +3642,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3738,7 +3663,6 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3754,7 +3678,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3769,7 +3692,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3784,7 +3706,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3798,7 +3719,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -3813,7 +3733,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3827,7 +3746,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3841,7 +3759,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3855,7 +3772,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -3869,7 +3785,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3884,7 +3799,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3898,8 +3812,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3914,8 +3826,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3936,7 +3846,6 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -3952,7 +3861,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -3967,7 +3875,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3982,7 +3889,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -3996,7 +3902,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4011,7 +3916,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4025,7 +3929,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4039,7 +3942,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4053,7 +3955,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4067,7 +3968,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4082,7 +3982,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4096,8 +3995,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4112,8 +4009,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4134,7 +4029,6 @@ entry: define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4151,7 +4045,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4167,7 +4060,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4184,7 +4076,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4199,7 +4090,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4215,7 +4105,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4230,7 +4119,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4245,7 +4133,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4260,7 +4147,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4275,7 +4161,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4292,7 +4177,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4307,8 +4191,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4327,8 +4209,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4350,7 +4230,6 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4367,7 +4246,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4383,7 +4261,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4400,7 +4277,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4415,7 +4291,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4431,7 +4306,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4446,7 +4320,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4461,7 +4334,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4476,7 +4348,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4491,7 +4362,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4508,7 +4378,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4523,8 +4392,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4543,8 +4410,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4566,7 +4431,6 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4583,7 +4447,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4599,7 +4462,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4616,7 +4478,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4631,7 +4492,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4647,7 +4507,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4662,7 +4521,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4677,7 +4535,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4692,7 +4549,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4707,7 +4563,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4724,7 +4579,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4739,8 +4593,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4759,8 +4611,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4782,7 +4632,6 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -4799,7 +4648,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -4815,7 +4663,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4832,7 +4679,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4847,7 +4693,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -4863,7 +4708,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4878,7 +4722,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -4893,7 +4736,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4908,7 +4750,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -4923,7 +4764,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4940,7 +4780,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4955,8 +4794,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4975,8 +4812,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4998,7 +4833,6 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5015,7 +4849,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5031,7 +4864,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5048,7 +4880,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5063,7 +4894,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5079,7 +4909,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5094,7 +4923,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5109,7 +4937,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5124,7 +4951,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5139,7 +4965,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5156,7 +4981,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5171,8 +4995,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5191,8 +5013,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5214,7 +5034,6 @@ entry: define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5231,7 +5050,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5247,7 +5065,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5264,7 +5081,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5279,7 +5095,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5295,7 +5110,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5310,7 +5124,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5325,7 +5138,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5340,7 +5152,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5355,7 +5166,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5372,7 +5182,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5387,8 +5196,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5407,8 +5214,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5430,7 +5235,6 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5447,7 +5251,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5463,7 +5266,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5480,7 +5282,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5495,7 +5296,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5511,7 +5311,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5526,7 +5325,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5541,7 +5339,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5556,7 +5353,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5571,7 +5367,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5588,7 +5383,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5603,8 +5397,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5623,8 +5415,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5646,7 +5436,6 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -5663,7 +5452,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -5679,7 +5467,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5696,7 +5483,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5711,7 +5497,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -5727,7 +5512,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5742,7 +5526,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -5757,7 +5540,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5772,7 +5554,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -5787,7 +5568,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5804,7 +5584,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5819,8 +5598,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5839,8 +5616,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5862,10 +5637,10 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -5880,9 +5655,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -5897,10 +5672,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -5913,10 +5687,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -5929,9 +5702,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -5946,10 +5719,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5962,10 +5734,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -5978,10 +5749,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -5994,10 +5764,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6010,7 +5779,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6026,7 +5794,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6042,8 +5809,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6059,8 +5824,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6085,10 +5848,10 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6103,9 +5866,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6120,10 +5883,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6137,10 +5899,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6153,9 +5914,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6170,10 +5931,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6186,10 +5946,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6203,10 +5962,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6219,10 +5977,9 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6236,7 +5993,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6253,7 +6009,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6269,8 +6024,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6287,8 +6040,6 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6313,10 +6064,10 @@ entry: define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6332,9 +6083,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6350,10 +6101,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6368,10 +6118,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6385,9 +6134,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6403,10 +6152,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6420,10 +6168,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6437,10 +6184,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6454,10 +6200,9 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6471,7 +6216,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6489,7 +6233,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6506,8 +6249,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6527,8 +6268,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6554,10 +6293,10 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6573,9 +6312,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6591,10 +6330,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6610,10 +6348,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6627,9 +6364,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6645,10 +6382,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6662,10 +6398,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6680,10 +6415,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6697,10 +6431,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6715,7 +6448,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6734,7 +6466,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6751,8 +6482,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6773,8 +6502,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6800,10 +6527,10 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -6819,9 +6546,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -6837,10 +6564,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -6856,10 +6582,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -6873,9 +6598,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -6891,10 +6616,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6908,10 +6632,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -6926,10 +6649,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6943,10 +6665,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -6961,7 +6682,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6980,7 +6700,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6997,8 +6716,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7019,8 +6736,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7046,10 +6761,10 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7064,9 +6779,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7081,10 +6796,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7098,10 +6812,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7114,9 +6827,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7131,10 +6844,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7147,10 +6859,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7164,10 +6875,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7180,10 +6890,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7197,7 +6906,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7214,7 +6922,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7230,8 +6937,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7248,8 +6953,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7274,10 +6977,10 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7292,9 +6995,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7309,10 +7012,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7326,10 +7028,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7342,9 +7043,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7359,10 +7060,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7375,10 +7075,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7392,10 +7091,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7408,10 +7106,9 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7425,7 +7122,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7442,7 +7138,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7458,8 +7153,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7476,8 +7169,6 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7502,10 +7193,10 @@ entry: define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7521,9 +7212,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7539,10 +7230,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7558,10 +7248,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7575,9 +7264,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7593,10 +7282,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7610,10 +7298,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7628,10 +7315,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7645,10 +7331,9 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7663,7 +7348,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7682,7 +7366,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7699,8 +7382,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7721,8 +7402,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7748,10 +7427,10 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -7767,9 +7446,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -7785,10 +7464,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -7804,10 +7482,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -7821,9 +7498,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -7839,10 +7516,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7856,10 +7532,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -7874,10 +7549,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7891,10 +7565,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -7909,7 +7582,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7928,7 +7600,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7945,8 +7616,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7967,8 +7636,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7994,10 +7661,10 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8013,9 +7680,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8031,10 +7698,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8050,10 +7716,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8067,9 +7732,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8085,10 +7750,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8102,10 +7766,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8120,10 +7783,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8137,10 +7799,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8155,7 +7816,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8174,7 +7834,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8191,8 +7850,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8213,8 +7870,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8240,10 +7895,10 @@ entry: define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8259,9 +7914,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8277,10 +7932,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8296,10 +7950,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8313,9 +7966,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8331,10 +7984,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8348,10 +8000,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8366,10 +8017,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8383,10 +8033,9 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8401,7 +8050,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8420,7 +8068,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8437,8 +8084,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8459,8 +8104,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8486,10 +8129,10 @@ entry: define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8505,9 +8148,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8523,10 +8166,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8542,10 +8184,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8559,9 +8200,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8577,10 +8218,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8594,10 +8234,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8612,10 +8251,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8629,10 +8267,9 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8647,7 +8284,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8666,7 +8302,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8683,8 +8318,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8705,8 +8338,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8732,10 +8363,10 @@ entry: define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8751,9 +8382,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -8769,10 +8400,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -8788,10 +8418,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -8805,9 +8434,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -8823,10 +8452,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8840,10 +8468,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -8858,10 +8485,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8875,10 +8501,9 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -8893,7 +8518,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8912,7 +8536,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8929,8 +8552,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8951,8 +8572,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8978,10 +8597,10 @@ entry: define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -8997,9 +8616,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -9015,10 +8634,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -9034,10 +8652,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -9051,9 +8668,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -9069,10 +8686,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9086,10 +8702,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9104,10 +8719,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9121,10 +8735,9 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9139,7 +8752,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9158,7 +8770,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9175,8 +8786,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9197,8 +8806,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9224,10 +8831,10 @@ entry: define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -9243,9 +8850,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -9261,10 +8868,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -9280,10 +8886,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -9297,9 +8902,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -9315,10 +8920,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9332,10 +8936,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -9350,10 +8953,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9367,10 +8969,9 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -9385,7 +8986,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9404,7 +9004,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9421,8 +9020,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9443,8 +9040,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9470,8 +9065,8 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; GFX6-LABEL: local_workgroup_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9486,8 +9081,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX7-LABEL: local_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9500,8 +9095,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9512,8 +9107,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9524,8 +9119,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9538,8 +9133,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9550,8 +9145,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9562,8 +9157,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9574,8 +9169,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9586,8 +9181,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9598,8 +9193,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX11-CU-LABEL: local_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9610,8 +9205,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_unordered_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9622,8 +9217,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_unordered_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9641,8 +9236,8 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; GFX6-LABEL: local_workgroup_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9657,8 +9252,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9671,8 +9266,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9683,8 +9278,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9695,8 +9290,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9709,8 +9304,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9721,8 +9316,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9733,8 +9328,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9745,8 +9340,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9757,8 +9352,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9769,8 +9364,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9781,8 +9376,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9793,8 +9388,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9812,8 +9407,8 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX6-LABEL: local_workgroup_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9828,8 +9423,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -9842,8 +9437,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -9854,8 +9449,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -9866,8 +9461,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -9880,8 +9475,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9892,8 +9487,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9904,8 +9499,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9916,8 +9511,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -9928,8 +9523,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -9940,8 +9535,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -9952,8 +9547,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -9964,8 +9559,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -9983,8 +9578,8 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 ; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 ; GFX6-NEXT: s_mov_b32 m0, -1 @@ -9999,8 +9594,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10013,8 +9608,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 @@ -10025,8 +9620,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 @@ -10037,8 +9632,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_load: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10051,8 +9646,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10063,8 +9658,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10075,8 +9670,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10087,8 +9682,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v0 @@ -10099,8 +9694,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 @@ -10111,8 +9706,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 @@ -10123,8 +9718,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: ds_load_b32 v1, v0 @@ -10135,8 +9730,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 @@ -10154,9 +9749,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX6-LABEL: local_workgroup_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10166,8 +9761,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX7-LABEL: local_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10177,8 +9772,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10187,8 +9782,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10197,8 +9792,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10208,8 +9803,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10218,8 +9813,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10228,8 +9823,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10238,8 +9833,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10248,8 +9843,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10258,8 +9853,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX11-CU-LABEL: local_workgroup_one_as_unordered_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10268,8 +9863,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_unordered_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10278,8 +9873,8 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX12-CU-LABEL: local_workgroup_one_as_unordered_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10294,9 +9889,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX6-LABEL: local_workgroup_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10306,8 +9901,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10317,8 +9912,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10327,8 +9922,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10337,8 +9932,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10348,8 +9943,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10358,8 +9953,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10368,8 +9963,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10378,8 +9973,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10388,8 +9983,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10398,8 +9993,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10408,8 +10003,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10418,8 +10013,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10434,9 +10029,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-LABEL: local_workgroup_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10446,8 +10041,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX7-LABEL: local_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10457,8 +10052,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10467,8 +10062,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10477,8 +10072,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10488,8 +10083,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10498,8 +10093,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10508,8 +10103,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10518,8 +10113,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10528,8 +10123,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10538,8 +10133,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10548,8 +10143,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10558,8 +10153,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10574,9 +10169,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10586,8 +10181,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10597,8 +10192,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10607,8 +10202,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10617,8 +10212,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_store: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10628,8 +10223,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10638,8 +10233,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10648,8 +10243,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10658,8 +10253,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10668,8 +10263,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10678,8 +10273,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10688,8 +10283,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10698,8 +10293,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10714,9 +10309,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10726,8 +10321,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10737,8 +10332,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10747,8 +10342,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10757,8 +10352,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10768,8 +10363,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10778,8 +10373,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10788,8 +10383,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10798,8 +10393,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10808,8 +10403,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10818,8 +10413,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10828,8 +10423,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10838,8 +10433,8 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10854,9 +10449,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -10866,8 +10461,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -10877,8 +10472,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -10887,8 +10482,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -10897,8 +10492,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -10908,8 +10503,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10918,8 +10513,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -10928,8 +10523,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10938,8 +10533,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -10948,8 +10543,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10958,8 +10553,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10968,8 +10563,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -10978,8 +10573,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -10994,9 +10589,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11006,8 +10601,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11017,8 +10612,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11027,8 +10622,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11037,8 +10632,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11048,8 +10643,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11058,8 +10653,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11068,8 +10663,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11078,8 +10673,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11088,8 +10683,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11098,8 +10693,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11108,8 +10703,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11118,8 +10713,8 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11134,9 +10729,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11146,8 +10741,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11157,8 +10752,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11167,8 +10762,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11177,8 +10772,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11188,8 +10783,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11198,8 +10793,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11208,8 +10803,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11218,8 +10813,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11228,8 +10823,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11238,8 +10833,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11248,8 +10843,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11258,8 +10853,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11274,9 +10869,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 @@ -11286,8 +10881,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 @@ -11297,8 +10892,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 @@ -11307,8 +10902,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 @@ -11317,8 +10912,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 @@ -11328,8 +10923,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11338,8 +10933,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 @@ -11348,8 +10943,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11358,8 +10953,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -11368,8 +10963,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11378,8 +10973,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11388,8 +10983,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 @@ -11398,8 +10993,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 @@ -11414,9 +11009,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11430,8 +11025,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11445,8 +11040,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11458,8 +11053,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11471,8 +11066,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11486,8 +11081,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11499,8 +11094,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11512,8 +11107,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11525,8 +11120,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11538,8 +11133,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11551,8 +11146,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11564,8 +11159,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11577,8 +11172,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11597,9 +11192,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11613,8 +11208,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11628,8 +11223,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11641,8 +11236,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11654,8 +11249,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11669,8 +11264,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11682,8 +11277,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11695,8 +11290,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11708,8 +11303,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11721,8 +11316,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11734,8 +11329,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11747,8 +11342,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11760,8 +11355,8 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11780,9 +11375,9 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -11796,8 +11391,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -11811,8 +11406,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 @@ -11824,8 +11419,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -11837,8 +11432,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -11852,8 +11447,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11865,8 +11460,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 @@ -11878,8 +11473,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11891,8 +11486,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 @@ -11904,8 +11499,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11917,8 +11512,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11930,8 +11525,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 @@ -11943,8 +11538,8 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -11963,7 +11558,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -11978,7 +11572,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -11992,7 +11585,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12005,7 +11597,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12018,7 +11609,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12032,7 +11622,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12045,7 +11634,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12058,7 +11646,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12071,7 +11658,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12084,7 +11670,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12097,7 +11682,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12110,8 +11694,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12124,8 +11706,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12145,7 +11725,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12160,7 +11739,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12174,7 +11752,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12187,7 +11764,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12200,7 +11776,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12214,7 +11789,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12227,7 +11801,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12240,7 +11813,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12253,7 +11825,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12266,7 +11837,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12279,7 +11849,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12292,8 +11861,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12306,8 +11873,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12327,7 +11892,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12342,7 +11906,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12356,7 +11919,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12369,7 +11931,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12382,7 +11943,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12396,7 +11956,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12409,7 +11968,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12422,7 +11980,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12435,7 +11992,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12448,7 +12004,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12461,7 +12016,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12474,8 +12028,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12488,8 +12040,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12509,7 +12059,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12524,7 +12073,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12538,7 +12086,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12551,7 +12098,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12564,7 +12110,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12578,7 +12123,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12591,7 +12135,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12604,7 +12147,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12617,7 +12159,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12630,7 +12171,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12643,7 +12183,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12656,8 +12195,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12670,8 +12207,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12691,7 +12226,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12706,7 +12240,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12720,7 +12253,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12733,7 +12265,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12746,7 +12277,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12760,7 +12290,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12773,7 +12302,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12786,7 +12314,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12799,7 +12326,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12812,7 +12338,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12825,7 +12350,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12838,8 +12362,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12852,8 +12374,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12873,7 +12393,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -12888,7 +12407,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -12902,7 +12420,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12915,7 +12432,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12928,7 +12444,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -12942,7 +12457,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12955,7 +12469,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -12968,7 +12481,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12981,7 +12493,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -12994,7 +12505,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13007,7 +12517,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13020,8 +12529,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13034,8 +12541,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13055,7 +12560,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13070,7 +12574,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13084,7 +12587,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13097,7 +12599,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13110,7 +12611,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13124,7 +12624,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13137,7 +12636,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13150,7 +12648,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13163,7 +12660,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13176,7 +12672,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13189,7 +12684,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13202,8 +12696,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13216,8 +12708,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13237,7 +12727,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13252,7 +12741,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13266,7 +12754,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13279,7 +12766,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13292,7 +12778,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13306,7 +12791,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13319,7 +12803,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13332,7 +12815,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13345,7 +12827,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13358,7 +12839,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13371,7 +12851,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13384,8 +12863,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13398,8 +12875,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13419,7 +12894,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13434,7 +12908,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13448,7 +12921,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13461,7 +12933,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13474,7 +12945,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13488,7 +12958,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13501,7 +12970,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13514,7 +12982,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13527,7 +12994,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13540,7 +13006,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13553,7 +13018,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13566,8 +13030,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13580,8 +13042,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13601,7 +13061,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13616,7 +13075,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13630,7 +13088,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13643,7 +13100,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13656,7 +13112,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13670,7 +13125,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13683,7 +13137,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13696,7 +13149,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13709,7 +13161,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13722,7 +13173,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13735,7 +13185,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13748,8 +13197,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13762,8 +13209,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13783,7 +13228,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13798,7 +13242,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13812,7 +13255,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13825,7 +13267,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13838,7 +13279,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -13852,7 +13292,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13865,7 +13304,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -13878,7 +13316,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13891,7 +13328,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -13904,7 +13340,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13917,7 +13352,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13930,8 +13364,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13944,8 +13376,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13965,7 +13395,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -13980,7 +13409,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -13994,7 +13422,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14007,7 +13434,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14020,7 +13446,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14034,7 +13459,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14047,7 +13471,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14060,7 +13483,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14073,7 +13495,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14086,7 +13507,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14099,7 +13519,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14112,8 +13531,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14126,8 +13543,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14147,7 +13562,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14162,7 +13576,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14176,7 +13589,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14189,7 +13601,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14202,7 +13613,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14216,7 +13626,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14229,7 +13638,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14242,7 +13650,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14255,7 +13662,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14268,7 +13674,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14281,7 +13686,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14294,8 +13698,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14308,8 +13710,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14329,7 +13729,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14344,7 +13743,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14358,7 +13756,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14371,7 +13768,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14384,7 +13780,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14398,7 +13793,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14411,7 +13805,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14424,7 +13817,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14437,7 +13829,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14450,7 +13841,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14463,7 +13853,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14476,8 +13865,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14490,8 +13877,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14511,7 +13896,6 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 ; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 @@ -14526,7 +13910,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -14540,7 +13923,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14553,7 +13935,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14566,7 +13947,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3] ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 @@ -14580,7 +13960,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14593,7 +13972,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 @@ -14606,7 +13984,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14619,7 +13996,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 @@ -14632,7 +14008,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14645,7 +14020,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14658,8 +14032,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14672,8 +14044,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14693,10 +14063,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14711,9 +14081,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14728,10 +14098,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14744,10 +14113,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14760,9 +14128,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14777,10 +14145,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14793,10 +14160,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -14809,10 +14175,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14825,10 +14190,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -14841,7 +14205,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14857,7 +14220,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14873,8 +14235,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14890,8 +14250,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14916,10 +14274,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14934,9 +14292,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14951,10 +14309,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -14967,10 +14324,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -14983,9 +14339,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15000,10 +14356,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15016,10 +14371,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15032,10 +14386,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15048,10 +14401,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15064,7 +14416,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15080,7 +14431,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15096,8 +14446,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15113,8 +14461,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15139,10 +14485,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15157,9 +14503,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15174,10 +14520,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15190,10 +14535,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15206,9 +14550,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15223,10 +14567,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15239,10 +14582,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15255,10 +14597,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15271,10 +14612,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15287,7 +14627,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15303,7 +14642,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15319,8 +14657,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15336,8 +14672,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15362,10 +14696,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15380,9 +14714,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15397,10 +14731,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15413,10 +14746,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15429,9 +14761,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15446,10 +14778,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15462,10 +14793,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15478,10 +14808,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15494,10 +14823,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15510,7 +14838,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15526,7 +14853,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15542,8 +14868,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15559,8 +14883,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15585,10 +14907,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15603,9 +14925,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15620,10 +14942,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15636,10 +14957,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15652,9 +14972,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15669,10 +14989,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15685,10 +15004,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15701,10 +15019,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15717,10 +15034,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15733,7 +15049,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15749,7 +15064,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15765,8 +15079,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15782,8 +15094,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15808,10 +15118,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -15826,9 +15136,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -15843,10 +15153,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -15859,10 +15168,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -15875,9 +15183,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -15892,10 +15200,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15908,10 +15215,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -15924,10 +15230,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15940,10 +15245,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -15956,7 +15260,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15972,7 +15275,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15988,8 +15290,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16005,8 +15305,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16031,10 +15329,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16049,9 +15347,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16066,10 +15364,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16082,10 +15379,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16098,9 +15394,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16115,10 +15411,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16131,10 +15426,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16147,10 +15441,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16163,10 +15456,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16179,7 +15471,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16195,7 +15486,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16211,8 +15501,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16228,8 +15516,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16254,10 +15540,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16272,9 +15558,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16289,10 +15575,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16305,10 +15590,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16321,9 +15605,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16338,10 +15622,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16354,10 +15637,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16370,10 +15652,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16386,10 +15667,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16402,7 +15682,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16418,7 +15697,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16434,8 +15712,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16451,8 +15727,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16477,10 +15751,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16495,9 +15769,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16512,10 +15786,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16528,10 +15801,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16544,9 +15816,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16561,10 +15833,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16577,10 +15848,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16593,10 +15863,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16609,10 +15878,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16625,7 +15893,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16641,7 +15908,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16657,8 +15923,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16674,8 +15938,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16700,10 +15962,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16718,9 +15980,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16735,10 +15997,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16751,10 +16012,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16767,9 +16027,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16784,10 +16044,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16800,10 +16059,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -16816,10 +16074,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16832,10 +16089,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -16848,7 +16104,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16864,7 +16119,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16880,8 +16134,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16897,8 +16149,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16923,10 +16173,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16941,9 +16191,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16958,10 +16208,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -16974,10 +16223,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -16990,9 +16238,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17007,10 +16255,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17023,10 +16270,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17039,10 +16285,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17055,10 +16300,9 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17071,7 +16315,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17087,7 +16330,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17103,8 +16345,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17120,8 +16360,6 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17146,10 +16384,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17164,9 +16402,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17181,10 +16419,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17197,10 +16434,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17213,9 +16449,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17230,10 +16466,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17246,10 +16481,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17262,10 +16496,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17278,10 +16511,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17294,7 +16526,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17310,7 +16541,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17326,8 +16556,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17343,8 +16571,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17369,10 +16595,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17387,9 +16613,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17404,10 +16630,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17420,10 +16645,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17436,9 +16660,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17453,10 +16677,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17469,10 +16692,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17485,10 +16707,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17501,10 +16722,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17517,7 +16737,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17533,7 +16752,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17549,8 +16767,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17566,8 +16782,6 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17592,10 +16806,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17610,9 +16824,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17627,10 +16841,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17643,10 +16856,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17659,9 +16871,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17676,10 +16888,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17692,10 +16903,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17708,10 +16918,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17724,10 +16933,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17740,7 +16948,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17756,7 +16963,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17772,8 +16978,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17789,8 +16993,6 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17815,10 +17017,10 @@ entry: define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 -; GFX6-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -17833,9 +17035,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -17850,10 +17052,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -17866,10 +17067,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -17882,9 +17082,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x1 -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -17899,10 +17099,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17915,10 +17114,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x4 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -17931,10 +17129,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17947,10 +17144,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x4 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 @@ -17963,7 +17159,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17979,7 +17174,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17995,8 +17189,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18012,8 +17204,6 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index 8e292fa5929756..f988a4d33add92 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -5,10 +5,8 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_load_0: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU @@ -24,11 +22,9 @@ entry: define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_load_1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-NEXT: s_wait_alu 0xfffe @@ -52,10 +48,8 @@ entry: define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_and_volatile_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS @@ -73,10 +67,8 @@ entry: define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_and_nontemporal_load: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index c3599c87985bec..71f28efd478117 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -16,10 +16,11 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-LABEL: private_nontemporal_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_add_u32 s0, s0, s13 +; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -37,11 +38,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX7-LABEL: private_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc slc @@ -53,12 +53,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-WGP-LABEL: private_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -69,12 +67,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-CU-LABEL: private_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -89,10 +85,11 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -110,12 +107,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -126,12 +121,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -142,10 +135,8 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v1, off, s2 nt @@ -155,10 +146,8 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: scratch_load_dword v1, off, s2 nt @@ -168,9 +157,8 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX11-WGP-LABEL: private_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 slc dlc @@ -180,9 +168,8 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX11-CU-LABEL: private_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 slc dlc @@ -192,10 +179,8 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX12-WGP-LABEL: private_nontemporal_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT @@ -205,10 +190,8 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX12-CU-LABEL: private_nontemporal_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT @@ -225,10 +208,11 @@ entry: define amdgpu_kernel void @private_nontemporal_load_1( ; GFX6-LABEL: private_nontemporal_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_add_u32 s0, s0, s13 +; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -248,11 +232,10 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX7-LABEL: private_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -266,13 +249,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-WGP-LABEL: private_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -284,13 +265,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-CU-LABEL: private_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -306,10 +285,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -329,13 +309,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 @@ -350,13 +328,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 @@ -371,11 +347,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff ; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 @@ -390,11 +364,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff ; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 @@ -409,10 +381,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 @@ -426,10 +397,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 @@ -443,11 +413,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX12-WGP-LABEL: private_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -463,11 +431,9 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX12-CU-LABEL: private_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -492,11 +458,10 @@ entry: define amdgpu_kernel void @private_nontemporal_store_0( ; GFX6-LABEL: private_nontemporal_store_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_add_u32 s0, s0, s13 +; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -507,11 +472,10 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX7-LABEL: private_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -522,12 +486,10 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-WGP-LABEL: private_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -538,12 +500,10 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-CU-LABEL: private_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -558,11 +518,10 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -573,12 +532,10 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -589,12 +546,10 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -605,10 +560,8 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -618,10 +571,8 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -631,9 +582,8 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX11-WGP-LABEL: private_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -643,9 +593,8 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX11-CU-LABEL: private_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -655,10 +604,8 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX12-WGP-LABEL: private_nontemporal_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -668,10 +615,8 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX12-CU-LABEL: private_nontemporal_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -688,11 +633,10 @@ entry: define amdgpu_kernel void @private_nontemporal_store_1( ; GFX6-LABEL: private_nontemporal_store_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_add_u32 s0, s0, s13 +; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s5, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX6-NEXT: s_mov_b32 s6, 2 @@ -705,11 +649,10 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX7-LABEL: private_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -722,11 +665,10 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-WGP-LABEL: private_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_mov_b32 s5, 2 @@ -738,11 +680,10 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-CU-LABEL: private_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-CU-NEXT: s_mov_b32 s5, 2 @@ -758,11 +699,10 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 @@ -775,11 +715,10 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff @@ -794,11 +733,10 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff @@ -813,9 +751,8 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff @@ -830,9 +767,8 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff @@ -847,8 +783,8 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff @@ -862,8 +798,8 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-CU-LABEL: private_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff @@ -877,10 +813,8 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX12-WGP-LABEL: private_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff @@ -896,10 +830,8 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX12-CU-LABEL: private_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff @@ -924,10 +856,11 @@ entry: define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX6-LABEL: private_nontemporal_volatile_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_add_u32 s0, s0, s13 +; GFX6-NEXT: s_add_u32 s0, s0, s15 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_load_dword s8, s[6:7], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, s5 ; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 @@ -945,11 +878,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX7-LABEL: private_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc @@ -961,12 +893,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: private_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -977,12 +907,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: private_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -997,10 +925,11 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -1018,12 +947,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1034,12 +961,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s13 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_nop 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 @@ -1050,10 +975,8 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_nop 0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1 @@ -1063,10 +986,8 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX940-TGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_nop 0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1 @@ -1076,9 +997,8 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX11-WGP-LABEL: private_nontemporal_volatile_load: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 glc dlc @@ -1088,9 +1008,8 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX11-CU-LABEL: private_nontemporal_volatile_load: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 glc dlc @@ -1100,10 +1019,8 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX12-WGP-LABEL: private_nontemporal_volatile_load: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS @@ -1115,10 +1032,8 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX12-CU-LABEL: private_nontemporal_volatile_load: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 9146f175eefcd1..3346a034f963f8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -16,10 +16,11 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX6-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX6-NEXT: s_add_u32 s12, s12, s9 +; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s7, s1 ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -37,11 +38,10 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX7-LABEL: private_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc @@ -53,12 +53,10 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-WGP-LABEL: private_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 @@ -69,12 +67,10 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-CU-LABEL: private_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 @@ -89,10 +85,11 @@ define amdgpu_kernel void @private_volatile_load_0( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -110,9 +107,8 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX11-WGP-LABEL: private_volatile_load_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 glc dlc @@ -122,9 +118,8 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX11-CU-LABEL: private_volatile_load_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 glc dlc @@ -134,10 +129,8 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX12-WGP-LABEL: private_volatile_load_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS @@ -149,10 +142,8 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX12-CU-LABEL: private_volatile_load_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS @@ -175,10 +166,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX6-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX6-NEXT: s_add_u32 s12, s12, s9 +; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s7, s1 ; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -198,11 +190,10 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX7-LABEL: private_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -216,13 +207,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-WGP-LABEL: private_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -234,13 +223,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-CU-LABEL: private_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -256,10 +243,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 @@ -279,10 +267,9 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 @@ -296,10 +283,9 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff ; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 @@ -313,11 +299,9 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX12-WGP-LABEL: private_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-WGP-NEXT: s_wait_alu 0xfffe @@ -335,11 +319,9 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX12-CU-LABEL: private_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff ; GFX12-CU-NEXT: s_wait_alu 0xfffe @@ -370,11 +352,10 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX6-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX6-NEXT: s_add_u32 s12, s12, s9 +; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 -; GFX6-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -386,11 +367,10 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX7-LABEL: private_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -402,12 +382,10 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-WGP-LABEL: private_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -419,12 +397,10 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-CU-LABEL: private_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -440,11 +416,10 @@ define amdgpu_kernel void @private_volatile_store_0( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -456,9 +431,8 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX11-WGP-LABEL: private_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -469,9 +443,8 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX11-CU-LABEL: private_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -482,10 +455,8 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX12-WGP-LABEL: private_volatile_store_0: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 @@ -501,10 +472,8 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX12-CU-LABEL: private_volatile_store_0: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 @@ -531,11 +500,10 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX6-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX6-NEXT: s_add_u32 s12, s12, s9 +; GFX6-NEXT: s_add_u32 s12, s12, s11 ; GFX6-NEXT: s_addc_u32 s13, s13, 0 -; GFX6-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s1, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 2 @@ -549,11 +517,10 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX7-LABEL: private_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s13 +; GFX7-NEXT: s_add_u32 s0, s0, s15 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 2 @@ -567,11 +534,10 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-WGP-LABEL: private_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s13 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-WGP-NEXT: s_nop 0 -; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_mov_b32 s5, 2 @@ -584,11 +550,10 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-CU-LABEL: private_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s13 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GFX10-CU-NEXT: s_nop 0 -; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-CU-NEXT: s_mov_b32 s5, 2 @@ -605,11 +570,10 @@ define amdgpu_kernel void @private_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 -; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3] -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 @@ -623,8 +587,8 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-WGP-LABEL: private_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff @@ -639,8 +603,8 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-CU-LABEL: private_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff @@ -655,10 +619,8 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX12-WGP-LABEL: private_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff @@ -680,10 +642,8 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX12-CU-LABEL: private_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 1de9206801e2a8..15f31b4e86dbe5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,26 +5,26 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 @@ -69,7 +69,7 @@ bb: define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -250,8 +250,8 @@ bb: define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) { ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[0:1] @@ -260,16 +260,17 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: vector_clause_indirect: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_clause 0x1 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] @@ -385,25 +386,25 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s18, -1 ; GCN-NEXT: s_mov_b32 s19, 0xe00000 -; GCN-NEXT: s_add_u32 s16, s16, s9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-NEXT: s_add_u32 s16, s16, s11 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_brev_b32 s0, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1 +; GCN-NEXT: image_sample v0, v[0:1], s[8:15], s[0:3] dmask:0x1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v2, v0 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -411,13 +412,13 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; ; GCN-SCRATCH-LABEL: flat_scratch_load: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s8, s8, s13 +; GCN-SCRATCH-NEXT: s_addc_u32 s9, s9, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x44 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x44 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 @@ -457,7 +458,7 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 ; GCN-NEXT: s_mov_b32 s15, 0xe00000 -; GCN-NEXT: s_add_u32 s12, s12, s9 +; GCN-NEXT: s_add_u32 s12, s12, s11 ; GCN-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 @@ -476,10 +477,10 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; ; GCN-SCRATCH-LABEL: flat_scratch_load_clause: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s8, s8, s13 +; GCN-SCRATCH-NEXT: s_addc_u32 s9, s9, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index ff39439a2db1af..af7f92798a9319 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -31,8 +31,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,8 +53,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -75,8 +75,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -89,8 +89,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_sle_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -104,18 +104,18 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_sle_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid @@ -143,7 +143,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -154,7 +154,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -165,7 +165,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -175,7 +175,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_sle_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -185,7 +185,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_sle_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -213,7 +213,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -224,7 +224,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -235,7 +235,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -245,7 +245,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_imin_sle_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -255,7 +255,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_imin_sle_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -286,86 +286,86 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_min_i32 s2, s11, s15 -; CI-NEXT: s_min_i32 s3, s10, s14 -; CI-NEXT: s_min_i32 s4, s9, s13 -; CI-NEXT: s_min_i32 s5, s8, s12 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v0, s5 -; CI-NEXT: v_mov_b32_e32 v1, s4 -; CI-NEXT: v_mov_b32_e32 v2, s3 -; CI-NEXT: v_mov_b32_e32 v3, s2 -; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: s_min_i32 s3, s3, s7 +; CI-NEXT: s_min_i32 s2, s2, s6 +; CI-NEXT: s_min_i32 s1, s1, s5 +; CI-NEXT: s_min_i32 s0, s0, s4 +; CI-NEXT: v_mov_b32_e32 v4, s8 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_min_i32 s2, s11, s15 -; VI-NEXT: s_min_i32 s3, s10, s14 -; VI-NEXT: s_min_i32 s4, s9, s13 -; VI-NEXT: s_min_i32 s5, s8, s12 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_min_i32 s3, s3, s7 +; VI-NEXT: s_min_i32 s2, s2, s6 +; VI-NEXT: s_min_i32 s1, s1, s5 +; VI-NEXT: s_min_i32 s0, s0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_sle_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_min_i32 s2, s11, s15 -; GFX9-NEXT: s_min_i32 s3, s10, s14 -; GFX9-NEXT: s_min_i32 s4, s9, s13 -; GFX9-NEXT: s_min_i32 s5, s8, s12 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_min_i32 s3, s3, s7 +; GFX9-NEXT: s_min_i32 s2, s2, s6 +; GFX9-NEXT: s_min_i32 s1, s1, s5 +; GFX9-NEXT: s_min_i32 s0, s0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_min_i32 s2, s11, s15 -; GFX10-NEXT: s_min_i32 s3, s10, s14 -; GFX10-NEXT: s_min_i32 s4, s8, s12 -; GFX10-NEXT: s_min_i32 s5, s9, s13 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: s_min_i32 s3, s3, s7 +; GFX10-NEXT: s_min_i32 s2, s2, s6 +; GFX10-NEXT: s_min_i32 s0, s0, s4 +; GFX10-NEXT: s_min_i32 s1, s1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s7, s11 -; GFX11-NEXT: s_min_i32 s3, s6, s10 -; GFX11-NEXT: s_min_i32 s4, s4, s8 -; GFX11-NEXT: s_min_i32 s5, s5, s9 +; GFX11-NEXT: s_min_i32 s2, s11, s15 +; GFX11-NEXT: s_min_i32 s3, s10, s14 +; GFX11-NEXT: s_min_i32 s4, s8, s12 +; GFX11-NEXT: s_min_i32 s5, s9, s13 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: v_mov_b32_e32 v2, s3 @@ -411,9 +411,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; CI-LABEL: s_test_imin_sle_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0xa +; CI-NEXT: s_load_dword s3, s[8:9], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 @@ -426,9 +426,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x28 +; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 @@ -441,9 +441,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -456,9 +456,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 @@ -471,13 +471,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i8 s2, s4 -; GFX11-NEXT: s_sext_i32_i8 s3, s5 +; GFX11-NEXT: s_sext_i32_i8 s2, s2 +; GFX11-NEXT: s_sext_i32_i8 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -546,9 +546,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; CI-LABEL: s_test_imin_sle_v4i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dword s2, s[8:9], 0xa +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_load_dword s3, s[8:9], 0x13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -579,9 +579,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c +; VI-NEXT: s_load_dword s2, s[8:9], 0x28 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s3, s[8:9], 0x4c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 @@ -612,9 +612,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 @@ -648,9 +648,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-LABEL: s_test_imin_sle_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s4, s2 ; GFX10-NEXT: s_sext_i32_i16 s7, s3 @@ -680,26 +680,26 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX11-LABEL: s_test_imin_sle_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x28 +; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x4c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s4, s0 -; GFX11-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-NEXT: s_sext_i32_i16 s2, s0 +; GFX11-NEXT: s_lshr_b32 s3, s0, 16 ; GFX11-NEXT: s_sext_i32_i16 s7, s1 ; GFX11-NEXT: s_lshr_b32 s8, s1, 16 ; GFX11-NEXT: s_ashr_i32 s6, s0, 24 ; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX11-NEXT: s_ashr_i32 s9, s1, 24 ; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX11-NEXT: s_lshr_b32 s4, s4, 8 -; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000 +; GFX11-NEXT: s_lshr_b32 s2, s2, 8 +; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX11-NEXT: s_lshr_b32 s7, s7, 8 ; GFX11-NEXT: s_bfe_i32 s8, s8, 0x80000 ; GFX11-NEXT: v_min_i16 v0, s6, s9 ; GFX11-NEXT: v_min_i16 v1, s0, s1 -; GFX11-NEXT: v_min_i16 v2, s5, s8 -; GFX11-NEXT: v_min_i16 v3, s4, s7 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_min_i16 v2, s3, s8 +; GFX11-NEXT: v_min_i16 v3, s2, s7 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -756,7 +756,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -775,7 +775,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -794,7 +794,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -804,7 +804,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX10-LABEL: s_test_imin_sle_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3 @@ -813,7 +813,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX11-LABEL: s_test_imin_sle_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3 @@ -906,8 +906,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -936,8 +936,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -966,8 +966,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -980,8 +980,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 @@ -992,13 +992,13 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_min_i16 v1, s5, s7 -; GFX11-NEXT: v_pk_min_i16 v0, s4, s6 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_pk_min_i16 v1, s1, s3 +; GFX11-NEXT: v_pk_min_i16 v0, s0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b @@ -1031,8 +1031,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1053,8 +1053,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1075,8 +1075,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1089,8 +1089,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -1104,18 +1104,18 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid @@ -1170,8 +1170,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1192,8 +1192,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1214,8 +1214,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1228,8 +1228,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -1243,18 +1243,18 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_i16 v1, v1, v2 -; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1294,7 +1294,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -1315,7 +1315,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_slt_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -1325,7 +1325,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_slt_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -1354,8 +1354,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1368,8 +1368,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1382,8 +1382,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s1, s1, s3 @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s0, s0, s2 @@ -1410,15 +1410,15 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, s6 -; GFX11-NEXT: s_min_i32 s3, s5, s7 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_min_i32 s0, s0, s2 +; GFX11-NEXT: s_min_i32 s1, s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b @@ -1440,8 +1440,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_slt_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1452,8 +1452,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_slt_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1464,8 +1464,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_slt_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1476,8 +1476,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_slt_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1488,11 +1488,11 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_slt_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1517,8 +1517,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_sle_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1529,8 +1529,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_sle_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1541,8 +1541,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_sle_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1553,8 +1553,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_sle_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1565,11 +1565,11 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_sle_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1605,8 +1605,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1627,8 +1627,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1649,8 +1649,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1663,8 +1663,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ule_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -1678,18 +1678,18 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ule_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid @@ -1734,8 +1734,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1758,8 +1758,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,8 +1782,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] @@ -1798,8 +1798,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -1815,20 +1815,20 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b96 v[0:2], v6, s[6:7] -; GFX11-NEXT: global_load_b96 v[3:5], v6, s[0:1] +; GFX11-NEXT: global_load_b96 v[0:2], v6, s[2:3] +; GFX11-NEXT: global_load_b96 v[3:5], v6, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v4 ; GFX11-NEXT: v_min_u32_e32 v0, v0, v3 -; GFX11-NEXT: global_store_b96 v6, v[0:2], s[4:5] +; GFX11-NEXT: global_store_b96 v6, v[0:2], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid @@ -1895,8 +1895,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1931,8 +1931,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1959,8 +1959,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1975,8 +1975,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -1992,21 +1992,21 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] -; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX11-NEXT: v_pk_min_u16 v0, v0, v2 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b16 v4, v1, s[4:5] offset:4 -; GFX11-NEXT: global_store_b32 v4, v0, s[4:5] +; GFX11-NEXT: global_store_b16 v4, v1, s[0:1] offset:4 +; GFX11-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid @@ -2035,7 +2035,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2057,7 +2057,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2067,7 +2067,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ule_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2077,7 +2077,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ule_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2116,8 +2116,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2138,8 +2138,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2160,8 +2160,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2174,8 +2174,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ult_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -2189,18 +2189,18 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ult_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid @@ -2246,8 +2246,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; CI-LABEL: v_test_umin_ult_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2267,8 +2267,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_test_umin_ult_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2288,8 +2288,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_test_umin_ult_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] @@ -2301,8 +2301,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX10-LABEL: v_test_umin_ult_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -2315,16 +2315,16 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX11-LABEL: v_test_umin_ult_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] -; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] +; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_min_u16 v1, v1, v2 -; GFX11-NEXT: global_store_b8 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid @@ -2353,7 +2353,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2364,7 +2364,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2375,7 +2375,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2385,7 +2385,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ult_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2395,7 +2395,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ult_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2445,7 +2445,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2466,7 +2466,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2487,43 +2487,43 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i32_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[12:13], 0x0 -; GFX9-NEXT: s_load_dword s3, s[14:15], 0x0 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lt_u32 s2, s3 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v0, v2, s[8:9] -; GFX9-NEXT: global_store_byte v0, v1, s[10:11] +; GFX9-NEXT: s_cmp_lt_u32 s8, s9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s8, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_umin_ult_i32_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[12:13], 0x0 -; GFX10-NEXT: s_load_dword s1, s[14:15], 0x0 +; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_u32 s0, s1 -; GFX10-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-NEXT: s_and_b32 s2, s2, exec_lo -; GFX10-NEXT: s_cselect_b32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: global_store_dword v1, v2, s[8:9] -; GFX10-NEXT: global_store_byte v1, v0, s[10:11] +; GFX10-NEXT: s_cmp_lt_u32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s4, s8, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_umin_ult_i32_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 @@ -2593,7 +2593,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2615,7 +2615,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2637,38 +2637,38 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i16_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[12:13] -; GFX9-NEXT: global_load_ushort v2, v0, s[14:15] +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: global_store_short v0, v1, s[8:9] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: global_store_byte v0, v1, s[10:11] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_umin_ult_i16_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[12:13] -; GFX10-NEXT: global_load_ushort v2, v0, s[14:15] +; GFX10-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[8:9] -; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v2, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_umin_ult_i16_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2705,7 +2705,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2716,7 +2716,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2727,7 +2727,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2737,7 +2737,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_umin_ult_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2747,7 +2747,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2786,17 +2786,17 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_min_u32 s4, s11, s19 -; CI-NEXT: s_min_u32 s5, s10, s18 -; CI-NEXT: s_min_u32 s6, s9, s17 -; CI-NEXT: s_min_u32 s7, s8, s16 -; CI-NEXT: s_min_u32 s2, s15, s23 -; CI-NEXT: s_min_u32 s3, s14, s22 -; CI-NEXT: s_min_u32 s8, s13, s21 -; CI-NEXT: s_min_u32 s9, s12, s20 +; CI-NEXT: s_min_u32 s4, s15, s23 +; CI-NEXT: s_min_u32 s5, s14, s22 +; CI-NEXT: s_min_u32 s6, s13, s21 +; CI-NEXT: s_min_u32 s7, s12, s20 +; CI-NEXT: s_min_u32 s2, s19, s27 +; CI-NEXT: s_min_u32 s3, s18, s26 +; CI-NEXT: s_min_u32 s8, s17, s25 +; CI-NEXT: s_min_u32 s9, s16, s24 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_mov_b32_e32 v2, s3 @@ -2817,17 +2817,17 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_min_u32 s4, s11, s19 -; VI-NEXT: s_min_u32 s5, s10, s18 -; VI-NEXT: s_min_u32 s6, s9, s17 -; VI-NEXT: s_min_u32 s7, s8, s16 -; VI-NEXT: s_min_u32 s2, s15, s23 -; VI-NEXT: s_min_u32 s3, s14, s22 -; VI-NEXT: s_min_u32 s8, s13, s21 -; VI-NEXT: s_min_u32 s9, s12, s20 +; VI-NEXT: s_min_u32 s4, s15, s23 +; VI-NEXT: s_min_u32 s5, s14, s22 +; VI-NEXT: s_min_u32 s6, s13, s21 +; VI-NEXT: s_min_u32 s7, s12, s20 +; VI-NEXT: s_min_u32 s2, s19, s27 +; VI-NEXT: s_min_u32 s3, s18, s26 +; VI-NEXT: s_min_u32 s8, s17, s25 +; VI-NEXT: s_min_u32 s9, s16, s24 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -2848,18 +2848,18 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v8i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_min_u32 s4, s9, s17 -; GFX9-NEXT: s_min_u32 s5, s8, s16 -; GFX9-NEXT: s_min_u32 s6, s15, s23 -; GFX9-NEXT: s_min_u32 s7, s14, s22 -; GFX9-NEXT: s_min_u32 s8, s13, s21 -; GFX9-NEXT: s_min_u32 s9, s12, s20 -; GFX9-NEXT: s_min_u32 s2, s11, s19 -; GFX9-NEXT: s_min_u32 s3, s10, s18 +; GFX9-NEXT: s_min_u32 s6, s19, s27 +; GFX9-NEXT: s_min_u32 s7, s18, s26 +; GFX9-NEXT: s_min_u32 s8, s17, s25 +; GFX9-NEXT: s_min_u32 s9, s16, s24 +; GFX9-NEXT: s_min_u32 s2, s15, s23 +; GFX9-NEXT: s_min_u32 s3, s14, s22 +; GFX9-NEXT: s_min_u32 s4, s13, s21 +; GFX9-NEXT: s_min_u32 s5, s12, s20 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 @@ -2876,18 +2876,18 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX10-LABEL: s_test_umin_ult_v8i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_min_u32 s4, s9, s17 -; GFX10-NEXT: s_min_u32 s5, s8, s16 -; GFX10-NEXT: s_min_u32 s6, s15, s23 -; GFX10-NEXT: s_min_u32 s7, s14, s22 -; GFX10-NEXT: s_min_u32 s8, s12, s20 -; GFX10-NEXT: s_min_u32 s9, s13, s21 -; GFX10-NEXT: s_min_u32 s2, s11, s19 -; GFX10-NEXT: s_min_u32 s3, s10, s18 +; GFX10-NEXT: s_min_u32 s6, s19, s27 +; GFX10-NEXT: s_min_u32 s7, s18, s26 +; GFX10-NEXT: s_min_u32 s8, s16, s24 +; GFX10-NEXT: s_min_u32 s9, s17, s25 +; GFX10-NEXT: s_min_u32 s2, s15, s23 +; GFX10-NEXT: s_min_u32 s3, s14, s22 +; GFX10-NEXT: s_min_u32 s4, s13, s21 +; GFX10-NEXT: s_min_u32 s5, s12, s20 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 @@ -2903,24 +2903,24 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX11-LABEL: s_test_umin_ult_v8i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x20 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x20 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_u32 s2, s7, s15 -; GFX11-NEXT: s_min_u32 s3, s6, s14 -; GFX11-NEXT: s_min_u32 s6, s11, s19 -; GFX11-NEXT: s_min_u32 s7, s10, s18 -; GFX11-NEXT: s_min_u32 s8, s8, s16 -; GFX11-NEXT: s_min_u32 s9, s9, s17 -; GFX11-NEXT: s_min_u32 s5, s5, s13 -; GFX11-NEXT: s_min_u32 s4, s4, s12 +; GFX11-NEXT: s_min_u32 s4, s9, s17 +; GFX11-NEXT: s_min_u32 s5, s8, s16 +; GFX11-NEXT: s_min_u32 s6, s15, s23 +; GFX11-NEXT: s_min_u32 s7, s14, s22 +; GFX11-NEXT: s_min_u32 s8, s12, s20 +; GFX11-NEXT: s_min_u32 s9, s13, s21 +; GFX11-NEXT: s_min_u32 s2, s11, s19 +; GFX11-NEXT: s_min_u32 s3, s10, s18 ; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-NEXT: v_mov_b32_e32 v3, s6 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: v_mov_b32_e32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 +; GFX11-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-NEXT: v_mov_b32_e32 v6, s3 ; GFX11-NEXT: v_mov_b32_e32 v7, s2 ; GFX11-NEXT: s_clause 0x1 @@ -3075,138 +3075,138 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 +; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s2, s8, 16 -; CI-NEXT: s_and_b32 s3, s8, 0xffff -; CI-NEXT: s_lshr_b32 s4, s9, 16 -; CI-NEXT: s_and_b32 s5, s9, 0xffff -; CI-NEXT: s_lshr_b32 s6, s10, 16 -; CI-NEXT: s_and_b32 s7, s10, 0xffff -; CI-NEXT: s_lshr_b32 s8, s11, 16 -; CI-NEXT: s_and_b32 s9, s11, 0xffff -; CI-NEXT: s_lshr_b32 s10, s12, 16 -; CI-NEXT: s_and_b32 s11, s12, 0xffff -; CI-NEXT: s_lshr_b32 s12, s13, 16 -; CI-NEXT: s_lshr_b32 s16, s14, 16 -; CI-NEXT: s_lshr_b32 s17, s15, 16 -; CI-NEXT: s_and_b32 s13, s13, 0xffff -; CI-NEXT: s_and_b32 s14, s14, 0xffff -; CI-NEXT: s_and_b32 s15, s15, 0xffff -; CI-NEXT: s_min_u32 s8, s8, s17 -; CI-NEXT: s_min_u32 s6, s6, s16 -; CI-NEXT: s_min_u32 s4, s4, s12 -; CI-NEXT: s_min_u32 s2, s2, s10 -; CI-NEXT: s_min_u32 s9, s9, s15 -; CI-NEXT: s_lshl_b32 s8, s8, 16 -; CI-NEXT: s_min_u32 s7, s7, s14 +; CI-NEXT: s_lshr_b32 s10, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, 0xffff +; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: s_and_b32 s1, s1, 0xffff +; CI-NEXT: s_lshr_b32 s12, s2, 16 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_lshr_b32 s13, s3, 16 +; CI-NEXT: s_and_b32 s3, s3, 0xffff +; CI-NEXT: s_lshr_b32 s14, s4, 16 +; CI-NEXT: s_and_b32 s4, s4, 0xffff +; CI-NEXT: s_lshr_b32 s15, s5, 16 +; CI-NEXT: s_and_b32 s5, s5, 0xffff +; CI-NEXT: s_lshr_b32 s16, s6, 16 +; CI-NEXT: s_and_b32 s6, s6, 0xffff +; CI-NEXT: s_lshr_b32 s17, s7, 16 +; CI-NEXT: s_and_b32 s7, s7, 0xffff +; CI-NEXT: s_min_u32 s3, s3, s7 +; CI-NEXT: s_min_u32 s7, s13, s17 +; CI-NEXT: s_min_u32 s2, s2, s6 +; CI-NEXT: s_min_u32 s6, s12, s16 +; CI-NEXT: s_min_u32 s1, s1, s5 +; CI-NEXT: s_min_u32 s5, s11, s15 +; CI-NEXT: s_min_u32 s0, s0, s4 +; CI-NEXT: s_min_u32 s4, s10, s14 +; CI-NEXT: s_lshl_b32 s7, s7, 16 ; CI-NEXT: s_lshl_b32 s6, s6, 16 -; CI-NEXT: s_min_u32 s5, s5, s13 +; CI-NEXT: s_lshl_b32 s5, s5, 16 ; CI-NEXT: s_lshl_b32 s4, s4, 16 -; CI-NEXT: s_min_u32 s3, s3, s11 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: s_or_b32 s8, s9, s8 -; CI-NEXT: s_or_b32 s6, s7, s6 -; CI-NEXT: s_or_b32 s4, s5, s4 -; CI-NEXT: s_or_b32 s2, s3, s2 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s4 -; CI-NEXT: v_mov_b32_e32 v2, s6 -; CI-NEXT: v_mov_b32_e32 v3, s8 -; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: s_or_b32 s3, s3, s7 +; CI-NEXT: s_or_b32 s2, s2, s6 +; CI-NEXT: s_or_b32 s1, s1, s5 +; CI-NEXT: s_or_b32 s0, s0, s4 +; CI-NEXT: v_mov_b32_e32 v4, s8 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s11, 16 -; VI-NEXT: s_lshr_b32 s4, s10, 16 -; VI-NEXT: s_and_b32 s5, s10, 0xffff -; VI-NEXT: s_lshr_b32 s10, s15, 16 -; VI-NEXT: s_and_b32 s3, s11, 0xffff -; VI-NEXT: s_and_b32 s11, s15, 0xffff -; VI-NEXT: s_lshr_b32 s15, s14, 16 -; VI-NEXT: s_min_u32 s2, s2, s10 -; VI-NEXT: s_lshr_b32 s6, s9, 16 -; VI-NEXT: s_and_b32 s7, s9, 0xffff -; VI-NEXT: s_lshr_b32 s9, s8, 16 -; VI-NEXT: s_and_b32 s14, s14, 0xffff -; VI-NEXT: s_lshr_b32 s16, s13, 16 -; VI-NEXT: s_lshr_b32 s17, s12, 16 -; VI-NEXT: s_min_u32 s4, s4, s15 -; VI-NEXT: s_min_u32 s3, s3, s11 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_and_b32 s13, s13, 0xffff -; VI-NEXT: s_and_b32 s12, s12, 0xffff -; VI-NEXT: s_min_u32 s9, s9, s17 -; VI-NEXT: s_min_u32 s6, s6, s16 -; VI-NEXT: s_min_u32 s5, s5, s14 -; VI-NEXT: s_or_b32 s2, s3, s2 -; VI-NEXT: s_lshl_b32 s3, s4, 16 -; VI-NEXT: s_min_u32 s8, s8, s12 -; VI-NEXT: s_min_u32 s7, s7, s13 -; VI-NEXT: s_or_b32 s3, s5, s3 -; VI-NEXT: s_lshl_b32 s4, s6, 16 -; VI-NEXT: s_lshl_b32 s5, s9, 16 -; VI-NEXT: s_or_b32 s4, s7, s4 -; VI-NEXT: s_or_b32 s5, s8, s5 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_lshr_b32 s10, s3, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshr_b32 s11, s2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshr_b32 s12, s1, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshr_b32 s13, s0, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshr_b32 s16, s5, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshr_b32 s17, s4, 16 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_min_u32 s0, s0, s4 +; VI-NEXT: s_min_u32 s4, s13, s17 +; VI-NEXT: s_min_u32 s1, s1, s5 +; VI-NEXT: s_min_u32 s5, s12, s16 +; VI-NEXT: s_min_u32 s2, s2, s6 +; VI-NEXT: s_min_u32 s6, s11, s15 +; VI-NEXT: s_min_u32 s3, s3, s7 +; VI-NEXT: s_min_u32 s7, s10, s14 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_or_b32 s3, s3, s7 +; VI-NEXT: s_or_b32 s2, s2, s6 +; VI-NEXT: s_or_b32 s1, s1, s5 +; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_umin_ult_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_pk_min_u16 v3, s11, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_pk_min_u16 v2, s10, v1 -; GFX9-NEXT: v_pk_min_u16 v1, s9, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_pk_min_u16 v0, s8, v0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_pk_min_u16 v3, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_pk_min_u16 v2, s2, v1 +; GFX9-NEXT: v_pk_min_u16 v1, s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_pk_min_u16 v0, s0, v0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_umin_ult_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_min_u16 v3, s11, s15 -; GFX10-NEXT: v_pk_min_u16 v2, s10, s14 -; GFX10-NEXT: v_pk_min_u16 v1, s9, s13 -; GFX10-NEXT: v_pk_min_u16 v0, s8, s12 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: v_pk_min_u16 v3, s3, s7 +; GFX10-NEXT: v_pk_min_u16 v2, s2, s6 +; GFX10-NEXT: v_pk_min_u16 v1, s1, s5 +; GFX10-NEXT: v_pk_min_u16 v0, s0, s4 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_umin_ult_v8i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_min_u16 v3, s7, s11 -; GFX11-NEXT: v_pk_min_u16 v2, s6, s10 -; GFX11-NEXT: v_pk_min_u16 v1, s5, s9 -; GFX11-NEXT: v_pk_min_u16 v0, s4, s8 +; GFX11-NEXT: v_pk_min_u16 v3, s11, s15 +; GFX11-NEXT: v_pk_min_u16 v2, s10, s14 +; GFX11-NEXT: v_pk_min_u16 v1, s9, s13 +; GFX11-NEXT: v_pk_min_u16 v0, s8, s12 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %cmp = icmp ult <8 x i16> %a, %b @@ -3241,9 +3241,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0xa +; CI-NEXT: s_load_dword s3, s[8:9], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3256,9 +3256,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x28 +; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3271,9 +3271,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -3286,9 +3286,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff @@ -3301,13 +3301,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff -; GFX11-NEXT: s_and_b32 s3, s5, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3348,9 +3348,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0xa +; CI-NEXT: s_load_dword s3, s[8:9], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 @@ -3363,9 +3363,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x28 +; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 @@ -3378,9 +3378,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -3393,9 +3393,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 @@ -3408,13 +3408,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_sext_i32_i16 s3, s5 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-NEXT: s_sext_i32_i16 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3463,8 +3463,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; CI-LABEL: s_test_imin_sle_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3477,8 +3477,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; VI-LABEL: s_test_imin_sle_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3491,8 +3491,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; GFX9-LABEL: s_test_imin_sle_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s3, s2 @@ -3505,8 +3505,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-LABEL: s_test_imin_sle_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s3, s2 @@ -3519,14 +3519,14 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-LABEL: s_test_imin_sle_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_ashr_i32 s3, s4, 16 +; GFX11-NEXT: s_sext_i32_i16 s3, s2 +; GFX11-NEXT: s_ashr_i32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s2, s3, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -3557,8 +3557,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ult_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3575,8 +3575,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ult_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3593,8 +3593,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ult_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3611,8 +3611,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ult_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] @@ -3627,18 +3627,18 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ult_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1] +; GFX11-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s2, s2, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s6, s0 -; GFX11-NEXT: s_cselect_b32 s1, s7, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_and_b32 s6, s6, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tmp = icmp ult i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b @@ -3665,8 +3665,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ule_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3683,8 +3683,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ule_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3701,8 +3701,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ule_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3719,8 +3719,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ule_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] @@ -3735,18 +3735,18 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ule_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1] +; GFX11-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s2, s2, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s6, s0 -; GFX11-NEXT: s_cselect_b32 s1, s7, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_and_b32 s6, s6, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tmp = icmp ule i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b @@ -3773,8 +3773,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_slt_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3791,8 +3791,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_slt_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3809,8 +3809,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_slt_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3827,8 +3827,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_slt_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] @@ -3843,18 +3843,18 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_slt_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1] +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s2, s2, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s6, s0 -; GFX11-NEXT: s_cselect_b32 s1, s7, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_and_b32 s6, s6, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tmp = icmp slt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b @@ -3881,8 +3881,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_sle_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3899,8 +3899,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_sle_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3917,8 +3917,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_sle_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3935,8 +3935,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_sle_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] @@ -3951,18 +3951,18 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_sle_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1] +; GFX11-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s2, s2, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s6, s0 -; GFX11-NEXT: s_cselect_b32 s1, s7, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_and_b32 s6, s6, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, s4 +; GFX11-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tmp = icmp sle i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b @@ -4012,8 +4012,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4043,8 +4043,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4067,8 +4067,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -4081,8 +4081,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_sle_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -4096,18 +4096,18 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_sle_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid @@ -4162,8 +4162,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_ule_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4192,8 +4192,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_ule_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4216,8 +4216,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_ule_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -4230,8 +4230,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_ule_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -4245,18 +4245,18 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_ule_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_min_u16 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 466505c0bcbea0..9e0b7daf38de16 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -827,7 +827,7 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { ; GFX8-LABEL: v_minimumnum_f16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -835,7 +835,7 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { ; GFX9-LABEL: v_minimumnum_f16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -843,7 +843,7 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { ; GFX10-LABEL: v_minimumnum_f16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -877,24 +877,24 @@ define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) { ; GFX8-LABEL: v_minimumnum_f16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 -; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e64 v0, s17, s17 +; GFX8-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX8-NEXT: v_min_f16_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_f16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 -; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e64 v0, s17, s17 +; GFX9-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX9-NEXT: v_min_f16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimumnum_f16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 -; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e64 v0, s17, s17 +; GFX10-NEXT: v_max_f16_e64 v1, s16, s16 ; GFX10-NEXT: v_min_f16_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -928,7 +928,7 @@ define float @v_minimumnum_f32_s_v(float inreg %x, float %y) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -936,7 +936,7 @@ define float @v_minimumnum_f32_s_v(float inreg %x, float %y) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -944,7 +944,7 @@ define float @v_minimumnum_f32_s_v(float inreg %x, float %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -977,7 +977,7 @@ define float @v_minimumnum_f32_v_s(float %x, float inreg %y) { ; GFX8-LABEL: v_minimumnum_f32_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -985,7 +985,7 @@ define float @v_minimumnum_f32_v_s(float %x, float inreg %y) { ; GFX9-LABEL: v_minimumnum_f32_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -993,7 +993,7 @@ define float @v_minimumnum_f32_v_s(float %x, float inreg %y) { ; GFX10-LABEL: v_minimumnum_f32_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1027,24 +1027,24 @@ define float @v_minimumnum_f32_s_s(float inreg %x, float inreg %y) { ; GFX8-LABEL: v_minimumnum_f32_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 -; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_f32_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 -; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e64 v0, s17, s17 +; GFX9-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimumnum_f32_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 -; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e64 v0, s17, s17 +; GFX10-NEXT: v_max_f32_e64 v1, s16, s16 ; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1078,7 +1078,7 @@ define double @v_minimumnum_f64_s_v(double inreg %x, double %y) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1086,14 +1086,14 @@ define double @v_minimumnum_f64_s_v(double inreg %x, double %y) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimumnum_f64_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1127,7 +1127,7 @@ define double @v_minimumnum_f64_v_s(double %x, double inreg %y) { ; GFX8-LABEL: v_minimumnum_f64_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1135,7 +1135,7 @@ define double @v_minimumnum_f64_v_s(double %x, double inreg %y) { ; GFX9-LABEL: v_minimumnum_f64_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1143,7 +1143,7 @@ define double @v_minimumnum_f64_v_s(double %x, double inreg %y) { ; GFX10-LABEL: v_minimumnum_f64_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1177,24 +1177,24 @@ define double @v_minimumnum_f64_s_s(double inreg %x, double inreg %y) { ; GFX8-LABEL: v_minimumnum_f64_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], s[18:19], s[18:19] +; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_f64_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], s[18:19], s[18:19] +; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimumnum_f64_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] -; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], s[18:19], s[18:19] +; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[16:17] ; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index aa16937d7d897d..43e3a1fa294838 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -81,24 +81,25 @@ define amdgpu_kernel void @withcall() { ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 -; GFX9-NEXT: s_add_u32 s20, s20, s9 +; GFX9-NEXT: s_add_u32 s20, s20, s11 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 36 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 36 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 ; GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: ds_write_b32 v3, v3 offset:8 @@ -112,24 +113,25 @@ define amdgpu_kernel void @withcall() { ; GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s22, -1 ; GFX10-NEXT: s_mov_b32 s23, 0x31c16000 -; GFX10-NEXT: s_add_u32 s20, s20, s9 +; GFX10-NEXT: s_add_u32 s20, s20, s11 ; GFX10-NEXT: s_addc_u32 s21, s21, 0 -; GFX10-NEXT: s_mov_b32 s14, s8 -; GFX10-NEXT: s_add_u32 s8, s2, 36 -; GFX10-NEXT: s_addc_u32 s9, s3, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 +; GFX10-NEXT: s_mov_b32 s12, s8 +; GFX10-NEXT: s_add_u32 s8, s4, 36 +; GFX10-NEXT: s_mov_b32 s13, s9 +; GFX10-NEXT: s_addc_u32 s9, s5, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX10-NEXT: s_mov_b32 s14, s10 +; GFX10-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX10-NEXT: s_mov_b32 s12, s6 -; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX10-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: ds_write_b32 v3, v3 offset:8 @@ -143,17 +145,20 @@ define amdgpu_kernel void @withcall() { ; G_GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; G_GFX9-NEXT: s_mov_b32 s22, -1 ; G_GFX9-NEXT: s_mov_b32 s23, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s20, s20, s9 +; G_GFX9-NEXT: s_add_u32 s20, s20, s11 ; G_GFX9-NEXT: s_addc_u32 s21, s21, 0 -; G_GFX9-NEXT: s_mov_b32 s14, s8 -; G_GFX9-NEXT: s_add_u32 s8, s2, 36 -; G_GFX9-NEXT: s_addc_u32 s9, s3, 0 -; G_GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] -; G_GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; G_GFX9-NEXT: s_mov_b32 s16, s8 +; G_GFX9-NEXT: s_add_u32 s8, s4, 36 +; G_GFX9-NEXT: s_mov_b32 s15, s9 +; G_GFX9-NEXT: s_addc_u32 s9, s5, 0 +; G_GFX9-NEXT: s_mov_b64 s[12:13], s[0:1] ; G_GFX9-NEXT: s_getpc_b64 s[0:1] ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; G_GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; G_GFX9-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 +; G_GFX9-NEXT: s_mov_b32 s14, s10 +; G_GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; G_GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] @@ -161,12 +166,13 @@ define amdgpu_kernel void @withcall() { ; G_GFX9-NEXT: v_mov_b32_e32 v4, 8 ; G_GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; G_GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] -; G_GFX9-NEXT: s_mov_b32 s12, s6 -; G_GFX9-NEXT: s_mov_b32 s13, s7 +; G_GFX9-NEXT: s_mov_b64 s[4:5], s[12:13] +; G_GFX9-NEXT: s_mov_b32 s12, s16 +; G_GFX9-NEXT: s_mov_b32 s13, s15 ; G_GFX9-NEXT: s_mov_b32 s32, 0 ; G_GFX9-NEXT: ds_write_b32 v4, v3 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; G_GFX9-NEXT: s_endpgm ; ; G_GFX10-LABEL: withcall: @@ -175,30 +181,34 @@ define amdgpu_kernel void @withcall() { ; G_GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; G_GFX10-NEXT: s_mov_b32 s22, -1 ; G_GFX10-NEXT: s_mov_b32 s23, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s20, s20, s9 +; G_GFX10-NEXT: s_add_u32 s20, s20, s11 ; G_GFX10-NEXT: s_addc_u32 s21, s21, 0 -; G_GFX10-NEXT: s_mov_b32 s14, s8 -; G_GFX10-NEXT: s_add_u32 s8, s2, 36 -; G_GFX10-NEXT: s_addc_u32 s9, s3, 0 -; G_GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] -; G_GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] +; G_GFX10-NEXT: s_mov_b32 s16, s8 +; G_GFX10-NEXT: s_add_u32 s8, s4, 36 +; G_GFX10-NEXT: s_mov_b32 s15, s9 +; G_GFX10-NEXT: s_addc_u32 s9, s5, 0 +; G_GFX10-NEXT: s_mov_b64 s[12:13], s[0:1] ; G_GFX10-NEXT: s_getpc_b64 s[0:1] ; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 ; G_GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; G_GFX10-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; G_GFX10-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 ; G_GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; G_GFX10-NEXT: v_mov_b32_e32 v3, 0 ; G_GFX10-NEXT: v_mov_b32_e32 v4, 8 +; G_GFX10-NEXT: s_mov_b32 s14, s10 +; G_GFX10-NEXT: s_mov_b64 s[10:11], s[6:7] +; G_GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 +; G_GFX10-NEXT: s_mov_b64 s[6:7], s[2:3] ; G_GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] ; G_GFX10-NEXT: s_mov_b64 s[2:3], s[22:23] -; G_GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 -; G_GFX10-NEXT: s_mov_b32 s12, s6 -; G_GFX10-NEXT: s_mov_b32 s13, s7 +; G_GFX10-NEXT: s_mov_b64 s[4:5], s[12:13] +; G_GFX10-NEXT: s_mov_b32 s12, s16 +; G_GFX10-NEXT: s_mov_b32 s13, s15 ; G_GFX10-NEXT: s_mov_b32 s32, 0 ; G_GFX10-NEXT: ds_write_b32 v4, v3 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[18:19] ; G_GFX10-NEXT: s_endpgm store i32 0, ptr addrspace(3) @used_by_both call void @nonkernel() diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll index d4c66f00ffde8d..4e89a168372336 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: add_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -27,9 +27,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: add_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -50,9 +50,9 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -73,9 +73,9 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_imm_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -96,9 +96,9 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index eb638da3904055..fab5d386446d3c 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -13,28 +13,28 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xf -; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xf +; GCN-NEXT: s_mov_b64 s[2:3], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_i32_e32 v3, s2, v4 +; GCN-NEXT: v_max_i32_e32 v3, s4, v4 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v6, v4 ; GCN-NEXT: v_mov_b32_e32 v5, v3 @@ -42,15 +42,15 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN-NEXT: s_cbranch_execnz .LBB0_2 ; GCN-NEXT: ; %bb.3: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 ; GCN-NEXT: .LBB0_4: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -73,37 +73,37 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:400 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xf -; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 +; GCN-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:400 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xf +; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: .LBB1_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_max_i32_e32 v3, s2, v4 +; GCN-NEXT: v_max_i32_e32 v3, s6, v4 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v6, v4 ; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[4:7], 0 addr64 offset:400 glc +; GCN-NEXT: buffer_atomic_cmpswap v[5:6], v[1:2], s[0:3], 0 addr64 offset:400 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz .LBB1_2 ; GCN-NEXT: .LBB1_3: ; %exit ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 90a3d350e7416e..7eb44636f79d76 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -13,29 +13,29 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[2:3], 0xf +; GCN-NEXT: s_load_dword s2, s[4:5], 0xf ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, s11 +; GCN-NEXT: s_mov_b32 s3, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: buffer_atomic_smax v0, v[1:2], s[8:11], 0 addr64 offset:400 glc +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: .LBB0_2: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -58,24 +58,24 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[2:3], 0xf -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xf +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_atomic_smax v0, v[1:2], s[4:7], 0 addr64 offset:400 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_atomic_smax v0, v[1:2], s[0:3], 0 addr64 offset:400 ; GCN-NEXT: .LBB1_2: ; %exit ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index b696f097d05b70..6dbfebfd9b9d0d 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone @@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] -; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll index 57f7ceb964d857..9407c8a9ee436d 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -24,9 +24,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -45,9 +45,9 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -65,9 +65,9 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -86,9 +86,9 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -106,9 +106,9 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -127,9 +127,9 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -147,9 +147,9 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -168,9 +168,9 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -188,9 +188,9 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 03de142a41b4c7..bb7a591c914654 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_mul_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_mul_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -50,45 +50,45 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_mul_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_mul_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_mul_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -107,7 +107,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_mul_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -153,7 +153,7 @@ entry: define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_mul_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -197,52 +197,52 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_mul_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_v4i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -265,7 +265,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_mul_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -318,87 +318,88 @@ entry: define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: s_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_load_dword s3, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s7, s6 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mul_i32 s0, s3, s2 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_load_dword s3, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s7, s6 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mul_i32 s0, s3, s2 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[2:3], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x34 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s7, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s0, s3, s2 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mul_i32 s0, s0, s6 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x34 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mul_i32 s2, s3, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mul_i32 s0, s0, s6 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mul_i32 s2, s3, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mul_i32 s0, s0, s6 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_load_b32 s3, s[4:5], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mul_i32 s2, s3, s2 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX12-NEXT: s_endpgm ; ; EG-LABEL: s_trunc_i64_mul_to_i32: @@ -421,111 +422,111 @@ entry: define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s6 +; GFX10-NEXT: s_mov_b32 s15, s7 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s12, s6 -; GFX10-NEXT: s_mov_b32 s13, s7 +; GFX10-NEXT: s_mov_b32 s12, s2 +; GFX10-NEXT: s_mov_b32 s13, s3 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 @@ -534,21 +535,21 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: v_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s14, s10 ; GFX12-NEXT: s_mov_b32 s15, s11 -; GFX12-NEXT: s_mov_b32 s2, s10 -; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 ; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null -; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null -; GFX12-NEXT: s_mov_b32 s8, s4 -; GFX12-NEXT: s_mov_b32 s9, s5 +; GFX12-NEXT: buffer_load_b32 v1, off, s[4:7], null +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null @@ -587,25 +588,25 @@ entry: define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_hi_i32 v1, s4, v0 -; SI-NEXT: s_mulk_i32 s4, 0x50 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mul_hi_i32 v1, s6, v0 +; SI-NEXT: s_mulk_i32 s6, 0x50 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -614,28 +615,28 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_mul_hi_i32 s4, s6, 0x50 +; GFX9-NEXT: s_mulk_i32 s6, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_sext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_i32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX10-NEXT: s_mul_hi_i32 s2, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -644,13 +645,13 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: mul64_sext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_i32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -658,7 +659,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -691,25 +692,25 @@ entry: define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_hi_u32 v1, s4, v0 -; SI-NEXT: s_mulk_i32 s4, 0x50 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mul_hi_u32 v1, s6, v0 +; SI-NEXT: s_mulk_i32 s6, 0x50 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -718,28 +719,28 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, 0x50 +; GFX9-NEXT: s_mulk_i32 s6, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_zext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s2, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -748,13 +749,13 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: mul64_zext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_u32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -762,7 +763,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 @@ -794,7 +795,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -814,7 +815,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -833,46 +834,46 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_movk_i32 s0, 0x50 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_movk_i32 s2, 0x50 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul64_sext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul64_sext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -891,7 +892,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -937,7 +938,7 @@ entry: define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -957,7 +958,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -976,46 +977,46 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_movk_i32 s0, 0x50 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_movk_i32 s2, 0x50 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_u32 v1, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul64_zext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul64_zext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1034,7 +1035,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1080,7 +1081,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_inline_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1099,7 +1100,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_mul64_sext_inline_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1117,45 +1118,45 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_mul64_sext_inline_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul64_sext_inline_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul64_sext_inline_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1174,7 +1175,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_mul64_sext_inline_imm: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1220,39 +1221,39 @@ entry: define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: s_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dword s7, s[4:5], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_mul_i32 s4, s6, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_mul_i32 s4, s6, s7 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_i32 s4, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -1260,12 +1261,12 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX10-LABEL: s_mul_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x4c +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, s5 +; GFX10-NEXT: s_mul_i32 s2, s2, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1274,13 +1275,12 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX11-LABEL: s_mul_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -1289,13 +1289,12 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-LABEL: s_mul_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_i32 s2, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1320,7 +1319,7 @@ entry: define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1337,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1356,43 +1355,43 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1410,7 +1409,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_mul_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1454,42 +1453,42 @@ entry: define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { ; SI-LABEL: s_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dword s7, s[4:5], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_mul_i32 s6, s6, s7 +; SI-NEXT: s_and_b32 s4, s6, 1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 1 +; VI-NEXT: s_mul_i32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s4, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -1497,13 +1496,13 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX10-LABEL: s_mul_i1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x4c +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s4, s4, s5 -; GFX10-NEXT: s_and_b32 s2, s4, 1 +; GFX10-NEXT: s_mul_i32 s2, s2, s3 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -1512,14 +1511,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX11-LABEL: s_mul_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s4, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s4, 1 +; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_and_b32 s2, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -1528,14 +1527,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-LABEL: s_mul_i1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_i32 s4, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s2, s4, 1 +; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -1577,7 +1576,7 @@ entry: define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1597,7 +1596,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: v_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1617,48 +1616,48 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: v_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s2 -; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_mov_b32 s9, s7 +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1680,7 +1679,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX12-LABEL: v_mul_i1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1746,115 +1745,115 @@ entry: define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mul_hi_u32 v0, s6, v0 -; SI-NEXT: s_mul_i32 s4, s6, s9 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; SI-NEXT: s_mul_i32 s4, s7, s8 -; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v0 -; SI-NEXT: s_mul_i32 s4, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mul_hi_u32 v0, s2, v0 +; SI-NEXT: s_mul_i32 s0, s2, s9 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: s_mul_i32 s0, s3, s8 +; SI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; SI-NEXT: s_mul_i32 s0, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0 -; VI-NEXT: s_mul_i32 s4, s6, s9 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 -; VI-NEXT: s_mul_i32 s4, s7, s8 -; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s2, v0, 0 +; VI-NEXT: s_mul_i32 s0, s2, s9 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: s_mul_i32 s0, s3, s8 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mul_i32 s4, s6, s9 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s8 -; GFX9-NEXT: s_add_i32 s4, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s7, s8 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_i32 s5, s6, s8 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mul_i32 s0, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s8 +; GFX9-NEXT: s_add_i32 s0, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s3, s8 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s2, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s1, s6, s1 -; GFX10-NEXT: s_mul_hi_u32 s2, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s2, s1 -; GFX10-NEXT: s_mul_i32 s2, s7, s0 -; GFX10-NEXT: s_mul_i32 s0, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s4, s2, s7 +; GFX10-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX10-NEXT: s_mul_i32 s3, s3, s6 +; GFX10-NEXT: s_add_i32 s4, s5, s4 +; GFX10-NEXT: s_mul_i32 s2, s2, s6 +; GFX10-NEXT: s_add_i32 s4, s4, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s1, s6, s1 -; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s1, s2, s1 -; GFX11-NEXT: s_mul_i32 s2, s7, s0 -; GFX11-NEXT: s_mul_i32 s0, s6, s0 -; GFX11-NEXT: s_add_i32 s1, s1, s2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s0, s4 -; GFX11-NEXT: s_mov_b32 s1, s5 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mul_i32 s5, s2, s5 +; GFX11-NEXT: s_mul_hi_u32 s6, s2, s4 +; GFX11-NEXT: s_mul_i32 s3, s3, s4 +; GFX11-NEXT: s_add_i32 s5, s6, s5 +; GFX11-NEXT: s_mul_i32 s2, s2, s4 +; GFX11-NEXT: s_add_i32 s5, s5, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_endpgm ; ; EG-LABEL: s_mul_i64: @@ -1881,21 +1880,21 @@ entry: define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; SI-LABEL: v_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v1, v2, v1 ; SI-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1903,52 +1902,52 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v4, v2, v1 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[2:3], v2, v0, 0 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s12, s2 +; GFX9-NEXT: s_mov_b32 s13, s3 +; GFX9-NEXT: s_mov_b32 s14, s6 +; GFX9-NEXT: s_mov_b32 s15, s7 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1956,27 +1955,27 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_mov_b32 s14, s6 +; GFX10-NEXT: s_mov_b32 s15, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s12, s6 -; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mov_b32 s12, s2 +; GFX10-NEXT: s_mov_b32 s13, s3 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1984,27 +1983,27 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2019,21 +2018,21 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-LABEL: v_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 -; GFX12-NEXT: s_mov_b32 s2, s10 -; GFX12-NEXT: s_mov_b32 s3, s11 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s14, s10 ; GFX12-NEXT: s_mov_b32 s15, s11 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s6 -; GFX12-NEXT: s_mov_b32 s13, s7 -; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null -; GFX12-NEXT: s_mov_b32 s8, s4 -; GFX12-NEXT: s_mov_b32 s9, s5 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 @@ -2079,20 +2078,20 @@ entry: define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { ; SI-LABEL: mul32_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mul_i32 s6, s0, s1 -; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mul_i32 s8, s0, s1 +; SI-NEXT: s_mov_b64 s[6:7], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_mov_b64 s[6:7], -1 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz .LBB15_5 @@ -2104,7 +2103,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_branch .LBB15_6 ; SI-NEXT: .LBB15_5: -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: .LBB15_6: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2114,20 +2113,20 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul32_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mul_i32 s6, s0, s1 -; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_mul_i32 s8, s0, s1 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: s_mov_b64 s[4:5], -1 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_mov_b64 s[6:7], -1 +; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 @@ -2138,7 +2137,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_branch .LBB15_6 ; VI-NEXT: .LBB15_5: -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: .LBB15_6: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -2149,90 +2148,90 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul32_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %else ; GFX9-NEXT: s_mul_i32 s8, s0, s1 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: s_mov_b64 s[0:1], -1 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %if -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s6 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_branch .LBB15_6 ; GFX9-NEXT: .LBB15_5: ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: .LBB15_6: ; %endif ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul32_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s0, s0, s1 +; GFX10-NEXT: s_mul_i32 s7, s0, s1 ; GFX10-NEXT: s_branch .LBB15_3 ; GFX10-NEXT: .LBB15_2: -; GFX10-NEXT: s_mov_b32 s8, -1 -; GFX10-NEXT: ; implicit-def: $sgpr0 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: ; implicit-def: $sgpr7 ; GFX10-NEXT: .LBB15_3: ; %Flow -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX10-NEXT: ; %bb.4: ; %if -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s6 -; GFX10-NEXT: s_mov_b32 s1, s7 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_branch .LBB15_6 ; GFX10-NEXT: .LBB15_5: -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s7 ; GFX10-NEXT: .LBB15_6: ; %endif ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul32_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s5, s0, s1 +; GFX11-NEXT: s_mul_i32 s7, s0, s1 ; GFX11-NEXT: s_branch .LBB15_3 ; GFX11-NEXT: .LBB15_2: -; GFX11-NEXT: s_mov_b32 s4, -1 -; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: ; implicit-def: $sgpr7 ; GFX11-NEXT: .LBB15_3: ; %Flow -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX11-NEXT: ; %bb.4: ; %if ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 @@ -2243,7 +2242,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_branch .LBB15_6 ; GFX11-NEXT: .LBB15_5: -; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s7 ; GFX11-NEXT: .LBB15_6: ; %endif ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -2254,20 +2253,20 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul32_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u32 s0, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX12-NEXT: ; %bb.1: ; %else -; GFX12-NEXT: s_mul_i32 s5, s0, s1 +; GFX12-NEXT: s_mul_i32 s7, s0, s1 ; GFX12-NEXT: s_branch .LBB15_3 ; GFX12-NEXT: .LBB15_2: -; GFX12-NEXT: s_mov_b32 s4, -1 -; GFX12-NEXT: ; implicit-def: $sgpr5 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: ; implicit-def: $sgpr7 ; GFX12-NEXT: .LBB15_3: ; %Flow -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX12-NEXT: ; %bb.4: ; %if ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 @@ -2278,7 +2277,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-NEXT: s_branch .LBB15_6 ; GFX12-NEXT: .LBB15_5: -; GFX12-NEXT: v_mov_b32_e32 v0, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s7 ; GFX12-NEXT: .LBB15_6: ; %endif ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 @@ -2344,7 +2343,7 @@ endif: define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -2379,7 +2378,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2411,25 +2410,25 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s2, s8, s11 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s10 +; GFX9-NEXT: s_mul_i32 s2, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s3, s12, s14 ; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_mul_i32 s3, s9, s10 +; GFX9-NEXT: s_mul_i32 s3, s13, s14 ; GFX9-NEXT: s_add_i32 s3, s2, s3 -; GFX9-NEXT: s_mul_i32 s2, s8, s10 +; GFX9-NEXT: s_mul_i32 s2, s12, s14 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB16_4 ; GFX9-NEXT: .LBB16_2: ; %if ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s0, s6 -; GFX9-NEXT: s_mov_b32 s1, s7 +; GFX9-NEXT: s_mov_b32 s0, s10 +; GFX9-NEXT: s_mov_b32 s1, s11 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_branch .LBB16_5 ; GFX9-NEXT: .LBB16_3: @@ -2439,31 +2438,31 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: .LBB16_5: ; %endif -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX10-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s0, s8, s11 -; GFX10-NEXT: s_mul_hi_u32 s1, s8, s10 -; GFX10-NEXT: s_mul_i32 s2, s9, s10 +; GFX10-NEXT: s_mul_i32 s0, s12, s15 +; GFX10-NEXT: s_mul_hi_u32 s1, s12, s14 +; GFX10-NEXT: s_mul_i32 s2, s13, s14 ; GFX10-NEXT: s_add_i32 s0, s1, s0 ; GFX10-NEXT: s_add_i32 s1, s0, s2 -; GFX10-NEXT: s_mul_i32 s0, s8, s10 +; GFX10-NEXT: s_mul_i32 s0, s12, s14 ; GFX10-NEXT: s_cbranch_execnz .LBB16_4 ; GFX10-NEXT: .LBB16_2: ; %if ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s0, s6 -; GFX10-NEXT: s_mov_b32 s1, s7 +; GFX10-NEXT: s_mov_b32 s0, s10 +; GFX10-NEXT: s_mov_b32 s1, s11 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_branch .LBB16_5 ; GFX10-NEXT: .LBB16_3: @@ -2473,15 +2472,15 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: .LBB16_5: ; %endif -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2514,7 +2513,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2605,48 +2604,48 @@ endif: define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { ; SI-LABEL: s_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x1f -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x1f +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mul_hi_u32 v0, s8, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mul_hi_u32 v1, s10, v1 -; SI-NEXT: s_mul_i32 s7, s8, s7 -; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_mul_i32 s7, s10, s5 -; SI-NEXT: s_mul_i32 s12, s9, s6 -; SI-NEXT: s_mul_i32 s6, s8, s6 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: s_mul_i32 s7, s11, s4 -; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: s_mul_i32 s7, s10, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mul_hi_u32 v0, s12, v0 ; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mul_hi_u32 v5, s4, v1 -; SI-NEXT: v_mul_hi_u32 v1, s5, v1 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mul_hi_u32 v4, s4, v3 -; SI-NEXT: s_mul_i32 s7, s5, s8 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; SI-NEXT: s_mul_i32 s6, s4, s9 +; SI-NEXT: v_mul_hi_u32 v1, s14, v1 +; SI-NEXT: s_mul_i32 s4, s12, s11 +; SI-NEXT: s_mul_i32 s5, s13, s10 +; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; SI-NEXT: s_mul_i32 s5, s14, s9 +; SI-NEXT: s_mul_i32 s4, s12, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: s_mul_i32 s5, s15, s8 +; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1 +; SI-NEXT: s_mul_i32 s5, s14, s8 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_add_i32_e32 v2, vcc, s5, v2 +; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mul_hi_u32 v5, s8, v1 +; SI-NEXT: v_mul_hi_u32 v1, s9, v1 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mul_hi_u32 v4, s8, v3 +; SI-NEXT: s_mul_i32 s5, s9, s12 +; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5 +; SI-NEXT: s_mul_i32 s4, s8, s13 ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v5 -; SI-NEXT: v_mul_hi_u32 v3, s5, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v5 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; SI-NEXT: s_mul_i32 s5, s5, s9 -; SI-NEXT: v_addc_u32_e64 v5, s[6:7], 0, 0, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 +; SI-NEXT: v_mul_hi_u32 v3, s9, v3 +; SI-NEXT: v_addc_u32_e64 v5, s[4:5], 0, 0, vcc +; SI-NEXT: s_mul_i32 s4, s9, s13 +; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; SI-NEXT: s_mul_i32 s4, s4, s8 +; SI-NEXT: s_mul_i32 s4, s8, s12 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -2654,36 +2653,36 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; VI-LABEL: s_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0 -; VI-NEXT: s_mul_i32 s7, s8, s7 -; VI-NEXT: v_mov_b32_e32 v6, s8 -; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 -; VI-NEXT: s_mul_i32 s12, s9, s6 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0 -; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3 +; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0 +; VI-NEXT: s_mul_i32 s4, s12, s11 +; VI-NEXT: v_mov_b32_e32 v6, s12 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0 +; VI-NEXT: s_mul_i32 s6, s13, s10 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3] +; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5] +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], s14, v8, v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v7 ; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: v_mov_b32_e32 v8, s9 -; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7] -; VI-NEXT: s_mul_i32 s8, s11, s4 -; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2 +; VI-NEXT: v_mov_b32_e32 v8, s13 +; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s8, v8, v[6:7] +; VI-NEXT: s_mul_i32 s6, s15, s8 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v2 ; VI-NEXT: v_mov_b32_e32 v2, v5 ; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc -; VI-NEXT: s_mul_i32 s8, s10, s5 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3] -; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6 +; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s9, v8, v[2:3] +; VI-NEXT: s_mul_i32 s6, s14, s9 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v6 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc @@ -2693,193 +2692,193 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 -; GFX9-NEXT: s_add_i32 s7, s12, s7 -; GFX9-NEXT: s_mul_i32 s12, s9, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s12 -; GFX9-NEXT: s_mul_i32 s12, s10, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s11, s11, s4 -; GFX9-NEXT: s_mul_i32 s6, s8, s6 -; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s6 -; GFX9-NEXT: s_addc_u32 s11, s12, s7 -; GFX9-NEXT: s_mul_i32 s14, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX9-NEXT: s_mul_i32 s4, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s5, s12, s10 +; GFX9-NEXT: s_mul_i32 s6, s14, s9 +; GFX9-NEXT: s_mul_hi_u32 s7, s14, s8 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s5, s13, s10 +; GFX9-NEXT: s_add_i32 s6, s7, s6 +; GFX9-NEXT: s_mul_i32 s7, s15, s8 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_i32 s5, s12, s10 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_i32 s7, s14, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s5 +; GFX9-NEXT: s_addc_u32 s6, s6, s4 +; GFX9-NEXT: s_mul_i32 s14, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s7, s4, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 -; GFX9-NEXT: s_add_u32 s7, s7, s14 -; GFX9-NEXT: s_addc_u32 s12, s12, 0 -; GFX9-NEXT: s_add_u32 s12, s13, s12 -; GFX9-NEXT: s_addc_u32 s13, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s5, s9 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: s_addc_u32 s9, s14, s13 -; GFX9-NEXT: s_add_u32 s10, s5, s10 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s6 -; GFX9-NEXT: s_addc_u32 s9, s9, s11 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_mul_i32 s5, s8, s13 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 +; GFX9-NEXT: s_add_u32 s5, s5, s14 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_addc_u32 s11, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 +; GFX9-NEXT: s_mul_i32 s9, s9, s13 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s10, s14, s11 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_add_u32 s9, s9, s7 +; GFX9-NEXT: s_addc_u32 s10, s10, s6 +; GFX9-NEXT: s_mul_i32 s6, s8, s12 +; GFX9-NEXT: s_mov_b32 s7, s4 +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s12 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x7c +; GFX10-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s5, s6 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s8, s7 -; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 -; GFX10-NEXT: s_mul_i32 s14, s10, s5 -; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX10-NEXT: s_mul_i32 s13, s9, s6 -; GFX10-NEXT: s_mul_i32 s11, s11, s4 -; GFX10-NEXT: s_add_i32 s2, s7, s2 -; GFX10-NEXT: s_add_i32 s7, s15, s14 -; GFX10-NEXT: s_mul_i32 s6, s8, s6 -; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: s_add_i32 s2, s2, s13 -; GFX10-NEXT: s_add_i32 s7, s7, s11 -; GFX10-NEXT: s_mul_i32 s19, s5, s8 -; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 -; GFX10-NEXT: s_add_u32 s6, s10, s6 -; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX10-NEXT: s_addc_u32 s7, s7, s2 -; GFX10-NEXT: s_mul_i32 s17, s4, s9 -; GFX10-NEXT: s_add_u32 s2, s19, s20 -; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 -; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 -; GFX10-NEXT: s_mul_i32 s5, s5, s9 +; GFX10-NEXT: s_mul_i32 s3, s8, s3 +; GFX10-NEXT: s_mul_hi_u32 s4, s8, s2 +; GFX10-NEXT: s_mul_i32 s14, s10, s1 +; GFX10-NEXT: s_mul_hi_u32 s15, s10, s0 +; GFX10-NEXT: s_mul_i32 s7, s9, s2 +; GFX10-NEXT: s_mul_i32 s11, s11, s0 +; GFX10-NEXT: s_add_i32 s3, s4, s3 +; GFX10-NEXT: s_add_i32 s4, s15, s14 +; GFX10-NEXT: s_mul_i32 s2, s8, s2 +; GFX10-NEXT: s_mul_i32 s10, s10, s0 +; GFX10-NEXT: s_add_i32 s3, s3, s7 +; GFX10-NEXT: s_add_i32 s4, s4, s11 +; GFX10-NEXT: s_mul_i32 s19, s1, s8 +; GFX10-NEXT: s_mul_hi_u32 s20, s0, s8 +; GFX10-NEXT: s_add_u32 s2, s10, s2 +; GFX10-NEXT: s_mul_hi_u32 s18, s1, s8 +; GFX10-NEXT: s_addc_u32 s3, s4, s3 +; GFX10-NEXT: s_mul_i32 s17, s0, s9 +; GFX10-NEXT: s_add_u32 s4, s19, s20 +; GFX10-NEXT: s_mul_hi_u32 s16, s0, s9 +; GFX10-NEXT: s_mul_hi_u32 s21, s1, s9 +; GFX10-NEXT: s_mul_i32 s1, s1, s9 ; GFX10-NEXT: s_addc_u32 s9, s18, 0 -; GFX10-NEXT: s_add_u32 s13, s17, s2 +; GFX10-NEXT: s_add_u32 s7, s17, s4 ; GFX10-NEXT: s_addc_u32 s10, s16, 0 -; GFX10-NEXT: s_mul_i32 s2, s4, s8 -; GFX10-NEXT: s_add_u32 s4, s9, s10 +; GFX10-NEXT: s_mul_i32 s4, s0, s8 +; GFX10-NEXT: s_add_u32 s0, s9, s10 ; GFX10-NEXT: s_addc_u32 s8, 0, 0 -; GFX10-NEXT: s_add_u32 s4, s5, s4 -; GFX10-NEXT: s_addc_u32 s5, s21, s8 -; GFX10-NEXT: s_add_u32 s4, s4, s6 -; GFX10-NEXT: s_addc_u32 s5, s5, s7 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-NEXT: s_add_u32 s0, s1, s0 +; GFX10-NEXT: s_addc_u32 s1, s21, s8 +; GFX10-NEXT: s_add_u32 s2, s0, s2 +; GFX10-NEXT: s_addc_u32 s3, s1, s3 +; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_mov_b32 s15, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, -1 +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i128: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x4c -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x7c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x4c +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s3, s12 +; GFX11-NEXT: s_mov_b32 s13, s6 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s8, s7 -; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 -; GFX11-NEXT: s_mul_i32 s14, s10, s5 -; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX11-NEXT: s_mul_i32 s13, s9, s6 -; GFX11-NEXT: s_mul_i32 s11, s11, s4 -; GFX11-NEXT: s_add_i32 s2, s7, s2 +; GFX11-NEXT: s_mul_i32 s3, s8, s3 +; GFX11-NEXT: s_mul_hi_u32 s7, s8, s2 +; GFX11-NEXT: s_mul_i32 s14, s10, s1 +; GFX11-NEXT: s_mul_hi_u32 s15, s10, s0 +; GFX11-NEXT: s_mul_i32 s12, s9, s2 +; GFX11-NEXT: s_mul_i32 s11, s11, s0 +; GFX11-NEXT: s_add_i32 s3, s7, s3 ; GFX11-NEXT: s_add_i32 s7, s15, s14 -; GFX11-NEXT: s_mul_i32 s6, s8, s6 -; GFX11-NEXT: s_mul_i32 s10, s10, s4 -; GFX11-NEXT: s_add_i32 s2, s2, s13 +; GFX11-NEXT: s_mul_i32 s2, s8, s2 +; GFX11-NEXT: s_mul_i32 s10, s10, s0 +; GFX11-NEXT: s_add_i32 s3, s3, s12 ; GFX11-NEXT: s_add_i32 s7, s7, s11 -; GFX11-NEXT: s_mul_i32 s19, s5, s8 -; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 -; GFX11-NEXT: s_add_u32 s6, s10, s6 -; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX11-NEXT: s_addc_u32 s7, s7, s2 -; GFX11-NEXT: s_mul_i32 s17, s4, s9 -; GFX11-NEXT: s_add_u32 s2, s19, s20 -; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 -; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 -; GFX11-NEXT: s_mul_i32 s5, s5, s9 +; GFX11-NEXT: s_mul_i32 s19, s1, s8 +; GFX11-NEXT: s_mul_hi_u32 s20, s0, s8 +; GFX11-NEXT: s_add_u32 s2, s10, s2 +; GFX11-NEXT: s_mul_hi_u32 s18, s1, s8 +; GFX11-NEXT: s_addc_u32 s3, s7, s3 +; GFX11-NEXT: s_mul_i32 s17, s0, s9 +; GFX11-NEXT: s_add_u32 s7, s19, s20 +; GFX11-NEXT: s_mul_hi_u32 s16, s0, s9 +; GFX11-NEXT: s_mul_hi_u32 s21, s1, s9 +; GFX11-NEXT: s_mul_i32 s1, s1, s9 ; GFX11-NEXT: s_addc_u32 s9, s18, 0 -; GFX11-NEXT: s_add_u32 s13, s17, s2 +; GFX11-NEXT: s_add_u32 s7, s17, s7 ; GFX11-NEXT: s_addc_u32 s10, s16, 0 -; GFX11-NEXT: s_mul_i32 s2, s4, s8 -; GFX11-NEXT: s_add_u32 s4, s9, s10 +; GFX11-NEXT: s_mul_i32 s12, s0, s8 +; GFX11-NEXT: s_add_u32 s0, s9, s10 ; GFX11-NEXT: s_addc_u32 s8, 0, 0 -; GFX11-NEXT: s_add_u32 s4, s5, s4 -; GFX11-NEXT: s_addc_u32 s5, s21, s8 -; GFX11-NEXT: s_add_u32 s4, s4, s6 -; GFX11-NEXT: s_addc_u32 s5, s5, s7 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX11-NEXT: s_add_u32 s0, s1, s0 +; GFX11-NEXT: s_addc_u32 s1, s21, s8 +; GFX11-NEXT: s_add_u32 s2, s0, s2 +; GFX11-NEXT: s_addc_u32 s3, s1, s3 +; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[6:7] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_i128: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x7c -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x4c -; GFX12-NEXT: s_mov_b32 s13, 0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s15, s13 -; GFX12-NEXT: s_mov_b32 s3, s13 -; GFX12-NEXT: s_mov_b32 s17, s13 -; GFX12-NEXT: s_mov_b32 s19, s13 -; GFX12-NEXT: s_mov_b32 s24, s13 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c +; GFX12-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s7, s3 +; GFX12-NEXT: s_mov_b32 s5, s3 +; GFX12-NEXT: s_mov_b32 s17, s3 +; GFX12-NEXT: s_mov_b32 s19, s3 +; GFX12-NEXT: s_mov_b32 s24, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s4 -; GFX12-NEXT: s_mov_b32 s14, s8 -; GFX12-NEXT: s_mov_b32 s2, s9 -; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13] -; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13] -; GFX12-NEXT: s_mov_b32 s12, s23 -; GFX12-NEXT: s_mov_b32 s16, s5 -; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11] -; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[12:13] -; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9] -; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17] -; GFX12-NEXT: s_mov_b32 s12, s11 -; GFX12-NEXT: s_mov_b32 s11, s13 -; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5] -; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11] -; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[16:17] +; GFX12-NEXT: s_mov_b32 s2, s8 +; GFX12-NEXT: s_mov_b32 s6, s12 +; GFX12-NEXT: s_mov_b32 s4, s13 +; GFX12-NEXT: s_mul_u64 s[22:23], s[6:7], s[2:3] +; GFX12-NEXT: s_mul_u64 s[20:21], s[4:5], s[2:3] +; GFX12-NEXT: s_mov_b32 s2, s23 +; GFX12-NEXT: s_mov_b32 s16, s9 +; GFX12-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13] +; GFX12-NEXT: s_add_nc_u64 s[12:13], s[20:21], s[2:3] +; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17] +; GFX12-NEXT: s_mov_b32 s2, s13 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15] +; GFX12-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[12:13] +; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[16:17] ; GFX12-NEXT: s_mov_b32 s18, s7 +; GFX12-NEXT: s_mov_b32 s23, s3 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] +; GFX12-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9] ; GFX12-NEXT: s_mov_b32 s25, s6 -; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19] -; GFX12-NEXT: s_mov_b32 s23, s13 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] -; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25] -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25] +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[8:9] +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 @@ -2944,7 +2943,7 @@ entry: define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { ; SI-LABEL: v_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 @@ -2993,7 +2992,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; VI-LABEL: v_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3033,12 +3032,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 ; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 @@ -3059,18 +3058,18 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0 ; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2 @@ -3091,12 +3090,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12 ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7] +; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i128: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 @@ -3134,7 +3133,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 8f4c48fae6fb31..4302810089f0c9 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i32: @@ -100,7 +100,7 @@ entry: define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smulhi24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,17 +126,17 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i64: @@ -274,31 +274,31 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { ; SI-LABEL: test_smul24_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dword s4, s[4:5], 0x1c ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_i32 s5, s6, 0x180000 ; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; SI-NEXT: s_bfe_i32 s5, s5, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_mul_i32 s4, s5, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_mul_i32 s5, s4, s5 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dword s7, s[4:5], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; VI-NEXT: s_bfe_i32 s5, s5, 0x180000 +; VI-NEXT: s_bfe_i32 s4, s6, 0x180000 +; VI-NEXT: s_bfe_i32 s5, s7, 0x180000 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 ; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0 @@ -307,14 +307,14 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 ; ; GFX9-LABEL: test_smul24_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 ; GFX9-NEXT: s_mul_hi_i32 s6, s5, s4 ; GFX9-NEXT: s_mul_i32 s5, s5, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -376,12 +376,12 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i64_square: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: s_bfe_i32 s4, s6, 0x180000 ; SI-NEXT: s_mul_i32 s5, s4, s4 ; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4 ; SI-NEXT: v_mov_b32_e32 v0, s5 @@ -390,12 +390,12 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_smul24_i64_square: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s4, s6, 0x180000 ; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4 ; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -403,12 +403,12 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_smul24_i64_square: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 ; GFX9-NEXT: s_mul_hi_i32 s5, s4, s4 ; GFX9-NEXT: s_mul_i32 s4, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -463,14 +463,14 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dword s4, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s4, 8 -; SI-NEXT: s_lshl_b32 s7, s6, 8 +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_lshl_b32 s7, s4, 8 ; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; SI-NEXT: v_mov_b32_e32 v0, s6 @@ -484,12 +484,12 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s4, 8 -; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_lshl_b32 s3, s2, 8 +; VI-NEXT: s_lshl_b32 s5, s6, 8 ; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -504,15 +504,15 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; GFX9-NEXT: s_lshl_b32 s5, s6, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_lshl_b32 s5, s7, 8 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 ; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 ; GFX9-NEXT: s_mul_i32 s4, s4, s6 @@ -580,43 +580,43 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s7, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dword s7, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; GFX9-NEXT: s_lshl_b32 s5, s6, 8 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GFX9-NEXT: s_lshl_b32 s5, s7, 8 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 ; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 1 @@ -672,68 +672,68 @@ entry: define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { ; SI-LABEL: simplify_i24_crash: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %bb7 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB8_2: ; %bb11 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 -; SI-NEXT: s_bfe_i32 s4, s6, 0x180000 -; SI-NEXT: s_mul_i32 s4, s2, s4 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_bfe_i32 s0, s0, 0x180000 +; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 +; SI-NEXT: s_mul_i32 s0, s0, s1 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: simplify_i24_crash: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %bb7 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB8_2: ; %bb11 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; VI-NEXT: s_bfe_i32 s5, s6, 0x180000 -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_bfe_i32 s0, s0, 0x180000 +; VI-NEXT: s_bfe_i32 s1, s2, 0x180000 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: simplify_i24_crash: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %bb7 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB8_2: ; %bb11 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s6, 0x180000 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0x180000 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: simplify_i24_crash: @@ -817,7 +817,7 @@ bb7: define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: test_umul_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s1, s[2:3], 0xb +; SI-NEXT: s_load_dword s1, s[4:5], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0xff803fe1 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -833,7 +833,7 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) { ; ; VI-LABEL: test_umul_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0xff803fe1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -849,7 +849,7 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) { ; ; GFX9-LABEL: test_umul_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index a1099554559afa..864bc0bc2776f8 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffffff @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,17 +39,17 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %0 = shl i32 %a, 8 @@ -64,13 +64,13 @@ entry: define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16_sext: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -78,28 +78,28 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; VI-LABEL: test_umul24_i16_sext: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_sext_i32_i16 s4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i16_sext: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 +; GFX9-NEXT: s_lshr_b32 s4, s6, 16 +; GFX9-NEXT: s_mul_i32 s6, s6, s4 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -113,7 +113,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr_sext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; VI-LABEL: test_umul24_i16_vgpr_sext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -158,18 +158,18 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -186,13 +186,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -200,28 +200,28 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; VI-LABEL: test_umul24_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_mul_i32 s6, s6, s4 +; VI-NEXT: s_and_b32 s4, s6, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshr_b32 s4, s6, 16 +; GFX9-NEXT: s_mul_i32 s6, s6, s4 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -235,7 +235,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_umul24_i16_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -279,17 +279,17 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -307,21 +307,21 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; SI-LABEL: test_umul24_i8_vgpr: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: buffer_load_ubyte v0, v[3:4], s[12:15], 0 addr64 -; SI-NEXT: buffer_load_ubyte v1, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v1, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 8 @@ -330,38 +330,38 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: test_umul24_i8_vgpr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i8_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v3, v1, s[0:1] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -379,7 +379,7 @@ entry: define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi24_i32_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,17 +405,17 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i32 %a, 16777215 @@ -432,48 +432,49 @@ entry: define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umulhi24: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dword s3, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umulhi24: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_load_dword s3, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi24: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i64 %a, 16777215 @@ -489,55 +490,56 @@ entry: define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umul24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b32 s4, s6, 0xffffff +; SI-NEXT: s_load_dword s3, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_and_b32 s0, s2, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s5, s7, 0xffffff -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_and_b32 s1, s3, 0xffffff +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_mul_i32 s0, s0, s1 +; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_load_dword s3, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0 -; VI-NEXT: v_mul_u32_u24_e32 v0, s6, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s2, v0 +; VI-NEXT: v_mul_u32_u24_e32 v0, s2, v0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s2, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s2, s0, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -580,38 +582,38 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: test_umul24_i64_square: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s5, s4, 0xffffff -; SI-NEXT: s_mul_i32 s5, s5, s5 -; SI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_and_b32 s4, s6, 0xffffff +; SI-NEXT: s_mul_i32 s4, s4, s4 +; SI-NEXT: v_mul_hi_u32_u24_e64 v1, s6, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i64_square: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4 -; VI-NEXT: v_mul_u32_u24_e64 v0, s4, s4 +; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s6, s6 +; VI-NEXT: v_mul_u32_u24_e64 v0, s6, s6 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i64_square: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_mul_hi_u32 s5, s4, s4 ; GFX9-NEXT: s_mul_i32 s4, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -629,7 +631,7 @@ entry: define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffff @@ -645,7 +647,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -661,14 +663,14 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, 0xffff -; GFX9-NEXT: s_and_b32 s1, s7, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %a.16 = and i32 %a, 65535 @@ -683,47 +685,47 @@ entry: define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s5, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dword s4, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s6, s4, 0xffffff -; SI-NEXT: s_and_b32 s7, s5, 0xffffff -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 -; SI-NEXT: s_mul_i32 s6, s6, s7 +; SI-NEXT: s_and_b32 s5, s6, 0xffffff +; SI-NEXT: s_and_b32 s7, s4, 0xffffff +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0 +; SI-NEXT: s_mul_i32 s5, s5, s7 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dword s7, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mul_u32_u24_e32 v0, s5, v1 -; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s5, v1 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mul_u32_u24_e32 v0, s7, v1 +; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s7, v1 ; VI-NEXT: v_and_b32_e32 v1, 1, v1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff ; GFX9-NEXT: s_mul_i32 s6, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 1 @@ -745,42 +747,42 @@ entry: define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s7, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s7, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dword s7, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s7, v0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff ; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 727b607e7ded06..65c44768d3d88b 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -163,7 +163,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: multi_if_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 69971bca2738ad..0e750d879ac944 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -67,23 +67,25 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-LABEL: kernel_call: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: v_readlane_b32 s14, v3, 0 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, csr_vgpr_spill_fp_callee@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] -; CHECK-NEXT: s_mov_b32 s6, 20 -; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2 -; CHECK-NEXT: s_mov_b32 s6, 10 -; CHECK-NEXT: v_lshlrev_b32_e64 v1, s6, v1 +; CHECK-NEXT: s_mov_b32 s15, 20 +; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2 +; CHECK-NEXT: s_mov_b32 s15, 10 +; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] @@ -126,23 +128,25 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-LABEL: kernel_tailcall: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: v_readlane_b32 s14, v3, 0 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] -; CHECK-NEXT: s_mov_b32 s6, 20 -; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2 -; CHECK-NEXT: s_mov_b32 s6, 10 -; CHECK-NEXT: v_lshlrev_b32_e64 v1, s6, v1 +; CHECK-NEXT: s_mov_b32 s15, 20 +; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2 +; CHECK-NEXT: s_mov_b32 s15, 10 +; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] @@ -238,23 +242,25 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: v_readlane_b32 s14, v3, 0 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, caller_save_vgpr_spill_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, caller_save_vgpr_spill_fp@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] -; CHECK-NEXT: s_mov_b32 s6, 20 -; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2 -; CHECK-NEXT: s_mov_b32 s6, 10 -; CHECK-NEXT: v_lshlrev_b32_e64 v1, s6, v1 +; CHECK-NEXT: s_mov_b32 s15, 20 +; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2 +; CHECK-NEXT: s_mov_b32 s15, 10 +; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 296d484e247d6e..a52d9ff526c2ae 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocapture %arg) #0 { ; GCN-LABEL: reduced_nested_loop_conditions: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: s_mov_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index b84686139d0e2c..85096eb63f46e1 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -120,39 +120,39 @@ bb.2: ; ASSUME1024: ; ScratchSize: 1040 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { -; DEFAULTSIZE-V5-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: -; DEFAULTSIZE-V5: ; %bb.0: ; %entry -; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 -; DEFAULTSIZE-V5-NEXT: s_add_u32 s0, s0, s15 -; DEFAULTSIZE-V5-NEXT: s_addc_u32 s1, s1, 0 -; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, 0 -; DEFAULTSIZE-V5-NEXT: s_movk_i32 s32, 0x1000 -; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) -; DEFAULTSIZE-V5-NEXT: s_cmp_lg_u32 s4, 0 -; DEFAULTSIZE-V5-NEXT: s_cbranch_scc1 .LBB1_2 -; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0 -; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s32, 0x1000 -; DEFAULTSIZE-V5-NEXT: s_and_b32 s4, s4, 0xfffff000 -; DEFAULTSIZE-V5-NEXT: s_lshl_b32 s5, s5, 2 -; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v1, 0 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v3, 1 -; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s4, s5 -; DEFAULTSIZE-V5-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; DEFAULTSIZE-V5-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s4 -; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) -; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v0, v2, v0 -; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) -; DEFAULTSIZE-V5-NEXT: global_store_dword v1, v0, s[4:5] -; DEFAULTSIZE-V5-NEXT: .LBB1_2: ; %bb.1 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0 -; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off -; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) -; DEFAULTSIZE-V5-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x1000 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s4, 0 +; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000 +; MUBUF-NEXT: s_lshl_b32 s5, s5, 2 +; MUBUF-NEXT: s_mov_b32 s32, s4 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s4 +; MUBUF-NEXT: v_mov_b32_e32 v3, 1 +; MUBUF-NEXT: s_add_i32 s4, s4, s5 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v2, s4 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: global_store_dword v1, v0, s[4:5] +; MUBUF-NEXT: .LBB1_2: ; %bb.1 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; FLATSCR: ; %bb.0: ; %entry @@ -409,3 +409,4 @@ attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amd ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ASSUME1024: {{.*}} ; DEFAULTSIZE: {{.*}} +; DEFAULTSIZE-V5: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 31e8a49b16ca10..b8be5b300bb7b3 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -1696,7 +1696,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1707,7 +1707,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -1720,7 +1720,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc @@ -1730,7 +1730,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1757,7 +1757,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc @@ -1780,7 +1780,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS @@ -1796,7 +1796,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -1830,7 +1830,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -1846,7 +1846,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1859,7 +1859,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -1872,7 +1872,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1884,7 +1884,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -1894,7 +1894,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -1907,7 +1907,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -1926,7 +1926,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1939,7 +1939,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -1952,7 +1952,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1964,7 +1964,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS @@ -1974,7 +1974,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -1987,7 +1987,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2006,7 +2006,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2019,7 +2019,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2032,7 +2032,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2044,7 +2044,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2067,7 +2067,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2086,7 +2086,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2099,7 +2099,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2112,7 +2112,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2124,7 +2124,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2134,7 +2134,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2147,7 +2147,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2166,7 +2166,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2177,7 +2177,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2190,7 +2190,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -2200,7 +2200,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -2216,7 +2216,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2229,7 +2229,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2242,7 +2242,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -2264,7 +2264,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2277,7 +2277,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2296,7 +2296,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2309,7 +2309,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2322,7 +2322,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2334,7 +2334,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2357,7 +2357,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2376,7 +2376,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2389,7 +2389,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2402,7 +2402,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2414,7 +2414,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2424,7 +2424,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2437,7 +2437,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2456,7 +2456,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2469,7 +2469,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2482,7 +2482,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2504,7 +2504,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2517,7 +2517,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2536,7 +2536,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2549,7 +2549,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2562,7 +2562,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2574,7 +2574,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS @@ -2584,7 +2584,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2597,7 +2597,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2617,7 +2617,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -2629,7 +2629,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -2642,7 +2642,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2654,7 +2654,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2666,7 +2666,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2679,7 +2679,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2692,7 +2692,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2712,7 +2712,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -2724,7 +2724,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -2737,7 +2737,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2749,7 +2749,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2761,7 +2761,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2774,7 +2774,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2787,7 +2787,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2807,7 +2807,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -2819,7 +2819,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -2832,7 +2832,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2844,7 +2844,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2856,7 +2856,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2869,7 +2869,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2882,7 +2882,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2902,7 +2902,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2915,7 +2915,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -2928,7 +2928,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2940,7 +2940,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2952,7 +2952,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2965,7 +2965,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2978,7 +2978,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2998,7 +2998,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3011,7 +3011,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3024,7 +3024,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3036,7 +3036,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3048,7 +3048,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3061,7 +3061,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3074,7 +3074,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3094,7 +3094,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3107,7 +3107,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3120,7 +3120,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3144,7 +3144,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3157,7 +3157,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3170,7 +3170,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3190,7 +3190,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3204,7 +3204,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3217,7 +3217,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0 @@ -3230,7 +3230,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3243,7 +3243,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3256,7 +3256,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3269,7 +3269,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3289,7 +3289,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3303,7 +3303,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3316,7 +3316,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 @@ -3329,7 +3329,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3342,7 +3342,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3355,7 +3355,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3368,7 +3368,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3388,7 +3388,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3402,7 +3402,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3415,7 +3415,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0 @@ -3428,7 +3428,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3441,7 +3441,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3454,7 +3454,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3467,7 +3467,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3487,7 +3487,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3501,7 +3501,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3514,7 +3514,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 @@ -3527,7 +3527,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3540,7 +3540,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3553,7 +3553,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3566,7 +3566,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3586,7 +3586,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3600,7 +3600,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3613,7 +3613,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0 @@ -3626,7 +3626,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3639,7 +3639,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3652,7 +3652,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3665,7 +3665,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3685,7 +3685,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3699,7 +3699,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3712,7 +3712,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 @@ -3725,7 +3725,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3738,7 +3738,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3751,7 +3751,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3764,7 +3764,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index 3a9cf9678d8461..4ebbb10fae187c 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -1858,7 +1858,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc @@ -1868,7 +1868,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX10-LABEL: global_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc @@ -1878,7 +1878,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX11-LABEL: global_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc @@ -1914,7 +1914,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS @@ -1950,7 +1950,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -1960,7 +1960,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -1970,7 +1970,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -1980,7 +1980,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -1996,7 +1996,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2006,7 +2006,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2016,7 +2016,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2026,7 +2026,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2042,7 +2042,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc @@ -2052,7 +2052,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc @@ -2062,7 +2062,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc @@ -2072,7 +2072,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS @@ -2088,7 +2088,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2098,7 +2098,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2111,7 +2111,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2121,7 +2121,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2131,7 +2131,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2148,7 +2148,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 @@ -2160,7 +2160,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2173,7 +2173,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2186,7 +2186,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2207,7 +2207,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2225,7 +2225,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2245,7 +2245,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2255,7 +2255,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -2271,7 +2271,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2291,7 +2291,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2301,7 +2301,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2317,7 +2317,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2327,7 +2327,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2337,7 +2337,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2347,7 +2347,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS @@ -2363,7 +2363,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2373,7 +2373,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2386,7 +2386,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2396,7 +2396,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2406,7 +2406,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2423,7 +2423,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 @@ -2435,7 +2435,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2448,7 +2448,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2461,7 +2461,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2471,7 +2471,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2482,7 +2482,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2500,7 +2500,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000 @@ -2512,7 +2512,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2525,7 +2525,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2538,7 +2538,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS @@ -2548,7 +2548,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2559,7 +2559,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2578,7 +2578,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff @@ -2590,7 +2590,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2603,7 +2603,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2616,7 +2616,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2629,7 +2629,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -2640,7 +2640,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2652,7 +2652,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2671,7 +2671,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -2683,7 +2683,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2696,7 +2696,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2709,7 +2709,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2722,7 +2722,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -2733,7 +2733,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2745,7 +2745,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2764,7 +2764,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -2776,7 +2776,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2789,7 +2789,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2802,7 +2802,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2815,7 +2815,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -2826,7 +2826,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2838,7 +2838,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2857,7 +2857,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -2869,7 +2869,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2882,7 +2882,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2895,7 +2895,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -2908,7 +2908,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -2919,7 +2919,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2931,7 +2931,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2950,7 +2950,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -2962,7 +2962,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2975,7 +2975,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2988,7 +2988,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3001,7 +3001,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3012,7 +3012,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3024,7 +3024,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3043,7 +3043,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3055,7 +3055,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3068,7 +3068,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3081,7 +3081,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3094,7 +3094,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3105,7 +3105,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3117,7 +3117,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3136,7 +3136,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3148,7 +3148,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3160,7 +3160,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3172,7 +3172,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff @@ -3184,7 +3184,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3204,7 +3204,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -3216,7 +3216,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 @@ -3228,7 +3228,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x800 @@ -3240,7 +3240,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 @@ -3252,7 +3252,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3272,7 +3272,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -3284,7 +3284,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff @@ -3296,7 +3296,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0xfff @@ -3308,7 +3308,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff @@ -3320,7 +3320,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3340,7 +3340,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3352,7 +3352,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3364,7 +3364,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3376,7 +3376,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 @@ -3388,7 +3388,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3408,7 +3408,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3420,7 +3420,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3432,7 +3432,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff @@ -3456,7 +3456,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3476,7 +3476,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3488,7 +3488,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3500,7 +3500,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3512,7 +3512,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 @@ -3524,7 +3524,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index dd0b96fbb49591..938f66ce5c3871 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -43,7 +43,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -82,7 +82,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -173,7 +173,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -191,7 +191,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -205,7 +205,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -230,7 +230,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -247,7 +247,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -265,7 +265,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll index 7e5def00ee7cb4..e798646a8cd205 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll @@ -4,11 +4,11 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s4, 0 +; GCN-NEXT: s_bitcmp0_b32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 22, 33 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: global_store_dword v0, v1, s[0:1] @@ -23,11 +23,11 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1024: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s4, 10 +; GCN-NEXT: s_bitcmp0_b32 s2, 10 ; GCN-NEXT: s_cselect_b32 s2, 22, 33 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: global_store_dword v0, v1, s[0:1] @@ -42,11 +42,11 @@ define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x80000000: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s2, s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s4, 31 +; GCN-NEXT: s_bitcmp0_b32 s2, 31 ; GCN-NEXT: s_cselect_b32 s2, 22, 33 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: global_store_dword v0, v1, s[0:1] @@ -62,15 +62,15 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x8000000000000000: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s1, s5, 0x80000000 +; GCN-NEXT: s_and_b32 s1, s1, 0x80000000 ; GCN-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GCN-NEXT: s_cselect_b32 s0, 22, 33 ; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: global_store_dword v0, v1, s[6:7] +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %and = and i64 %arg, 9223372036854775808 %cmp = icmp eq i64 %and, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 4ee2b8e981f449..3e45a2d0df43d6 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-LABEL: negated_cond: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, 0 @@ -92,7 +92,7 @@ bb4: define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) { ; GCN-LABEL: negated_cond_dominated_blocks: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s6, 0 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index eff80236d98663..0adcb73422feff 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -70,7 +70,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -185,40 +185,40 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { ; GFX6-LABEL: vector_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s12, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dword s12, s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s12, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s12, s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s10, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i32: @@ -246,24 +246,24 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: scalar_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s4, s4, 0x1869f +; GFX6-NEXT: s_or_b32 s4, s6, 0x1869f ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s4, s4, 0x1869f +; GFX8-NEXT: s_or_b32 s4, s6, 0x1869f ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -286,29 +286,29 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_literal_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s5, s5, 0xf237b -; GFX6-NEXT: s_or_b32 s4, s4, 0x3039 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_or_b32 s4, s7, 0xf237b +; GFX6-NEXT: s_or_b32 s5, s6, 0x3039 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_literal_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s5, s5, 0xf237b -; GFX8-NEXT: s_or_b32 s4, s4, 0x3039 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_or_b32 s4, s7, 0xf237b +; GFX8-NEXT: s_or_b32 s5, s6, 0x3039 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -332,20 +332,20 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: scalar_or_literal_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d ; GFX6-NEXT: s_movk_i32 s8, 0x3039 ; GFX6-NEXT: s_mov_b32 s9, 0xf237b ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_add_u32 s0, s6, 0x3039 -; GFX6-NEXT: s_addc_u32 s1, s7, 0xf237b +; GFX6-NEXT: s_add_u32 s0, s4, 0x3039 +; GFX6-NEXT: s_addc_u32 s1, s5, 0xf237b ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -355,23 +355,23 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; ; GFX8-LABEL: scalar_or_literal_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 ; GFX8-NEXT: s_movk_i32 s8, 0x3039 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b -; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_add_u32 s0, s2, 0x3039 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_addc_u32 s1, s3, 0xf237b +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_add_u32 s0, s4, 0x3039 +; GFX8-NEXT: s_addc_u32 s1, s5, 0xf237b ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; @@ -408,27 +408,27 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s4, s4, 63 +; GFX6-NEXT: s_or_b32 s4, s6, 63 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s4, s4, 63 +; GFX8-NEXT: s_or_b32 s4, s6, 63 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -451,44 +451,44 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s2, s6, 63 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: s_add_u32 s0, s0, 63 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_or_b32 s0, s2, 63 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_add_u32 s0, s8, 63 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_addc_u32 s1, s9, 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s2, s6, 63 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_add_u32 s0, s0, 63 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_or_b32 s0, s2, 63 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_add_u32 s0, s8, 63 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; @@ -521,26 +521,26 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_neg_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s4, s4, -8 +; GFX6-NEXT: s_or_b32 s4, s6, -8 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_neg_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s4, s4, -8 +; GFX8-NEXT: s_or_b32 s4, s6, -8 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -565,7 +565,7 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -583,7 +583,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -624,7 +624,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_inline_immediate_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -642,7 +642,7 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; ; GFX8-LABEL: vector_or_inline_immediate_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -683,32 +683,32 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s4 -; GFX8-NEXT: s_mov_b32 s1, s5 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_or_i64: @@ -730,48 +730,48 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s12, s6 -; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s12, s2 +; GFX6-NEXT: s_mov_b32 s13, s3 +; GFX6-NEXT: s_mov_b32 s14, s6 +; GFX6-NEXT: s_mov_b32 s15, s7 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s10, s6 +; GFX8-NEXT: s_mov_b32 s11, s7 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s12, s2 +; GFX8-NEXT: s_mov_b32 s13, s3 +; GFX8-NEXT: s_mov_b32 s14, s6 +; GFX8-NEXT: s_mov_b32 s15, s7 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i64: @@ -803,42 +803,42 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { ; GFX6-LABEL: scalar_vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s2 +; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s10, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_vector_or_i64: @@ -867,7 +867,7 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_loadimm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -886,7 +886,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_i64_loadimm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -931,7 +931,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -949,7 +949,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: vector_or_i64_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -990,7 +990,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_inline_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: vector_or_i64_neg_inline_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_literal: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: vector_or_i64_neg_literal: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1116,26 +1116,26 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: trunc_i64_or_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dword s5, s[2:3], 0x1d -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x13 +; GFX6-NEXT: s_load_dword s7, s[4:5], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: s_or_b32 s4, s7, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: trunc_i64_or_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x4c +; GFX8-NEXT: s_load_dword s7, s[4:5], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_or_b32 s4, s7, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -1159,21 +1159,21 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; GFX6-LABEL: or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s12, s6 -; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s12, s2 +; GFX6-NEXT: s_mov_b32 s13, s3 +; GFX6-NEXT: s_mov_b32 s14, s6 +; GFX6-NEXT: s_mov_b32 s15, s7 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(1) ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1181,26 +1181,26 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s10, s6 +; GFX8-NEXT: s_mov_b32 s11, s7 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s12, s2 +; GFX8-NEXT: s_mov_b32 s13, s3 +; GFX8-NEXT: s_mov_b32 s14, s6 +; GFX8-NEXT: s_mov_b32 s15, s7 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: or_i1: @@ -1244,34 +1244,34 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GFX6-LABEL: s_or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_cmp_eq_u32 s4, s5 -; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, s7 -; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, s1 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s2, s3 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_eq_u32 s4, s5 -; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, s7 -; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, s1 +; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, s3 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: s_or_i1: diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 5792fab7011afe..da6120812ac1da 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX8-LABEL: s_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -89,7 +89,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; GFX9-LABEL: s_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -138,7 +138,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -162,7 +162,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -178,7 +178,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX8-LABEL: v_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -200,7 +200,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -240,7 +240,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -258,7 +258,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX8-LABEL: v_pack_v2f16_user: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2f16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -324,7 +324,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -339,7 +339,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -356,7 +356,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -386,7 +386,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX8-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -418,7 +418,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -448,7 +448,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -525,7 +525,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX8-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -542,7 +542,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX7-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -572,7 +572,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -586,7 +586,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX8-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -603,7 +603,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 529e64715500dd..44128f0e0dcd8a 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX803-LABEL: s_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -87,7 +87,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +122,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -135,7 +135,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX803-LABEL: v_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -196,7 +196,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX803-LABEL: v_pack_v2i16_user: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2i16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -331,7 +331,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -348,7 +348,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -391,7 +391,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -452,7 +452,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -469,7 +469,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -498,7 +498,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -512,7 +512,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -529,7 +529,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index 078b133a93d6f3..5f2bd53dc91a36 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -15,9 +15,9 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, ; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s94, -1 ; GCN-NEXT: s_mov_b32 s95, 0xe8f000 -; GCN-NEXT: s_add_u32 s92, s92, s9 +; GCN-NEXT: s_add_u32 s92, s92, s11 ; GCN-NEXT: s_addc_u32 s93, s93, 0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND @@ -476,9 +476,9 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s54, -1 ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 -; GCN-NEXT: s_add_u32 s52, s52, s9 +; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND @@ -719,9 +719,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s54, -1 ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 -; GCN-NEXT: s_add_u32 s52, s52, s9 +; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -955,9 +955,9 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { ; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s54, -1 ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 -; GCN-NEXT: s_add_u32 s52, s52, s9 +; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll index 560f0a06798102..d97d83e80618c6 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll @@ -97,7 +97,7 @@ define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) { define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) { ; GCN-LABEL: s_trunc_srl_i64_16_to_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s0, 4 diff --git a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll index 8f450e5bcb83f3..d21674d1b3a62e 100644 --- a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll @@ -4,10 +4,10 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) ; OBJ-LABEL: : -; OBJ: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] +; OBJ: v_permlane16_b32 v0, v0, s1, s2 op_sel:[1,0] ; ASM-LABEL: permlane_op_sel: -; ASM: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x0b,0x18,0x00] +; ASM: v_permlane16_b32 v0, v0, s1, s2 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x03,0x08,0x00] define amdgpu_kernel void @permlane_op_sel(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0) store i32 %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index 69ddc9a48dbc43..cac983a3acfb37 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @lsh8_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh8_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x6050400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ bb: define amdgpu_kernel void @lsr24_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsr24_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -58,8 +58,8 @@ bb: define amdgpu_kernel void @and_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -87,8 +87,8 @@ bb: define amdgpu_kernel void @and_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020500 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -115,8 +115,8 @@ bb: define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh8_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -142,8 +142,8 @@ bb: define amdgpu_kernel void @lsh16_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh16_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x5040c03 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -169,8 +169,8 @@ bb: define amdgpu_kernel void @and_xor_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_xor_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -197,8 +197,8 @@ bb: define amdgpu_kernel void @and_or_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -227,8 +227,8 @@ bb: define amdgpu_kernel void @and_or_and_shl(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and_shl: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x50c0c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -255,8 +255,8 @@ bb: define amdgpu_kernel void @or_and_or(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: or_and_or: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -283,8 +283,8 @@ bb: define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff0500: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -323,8 +323,8 @@ bb: define amdgpu_kernel void @known_050c0c00(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_050c0c00: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0x50c0c00 ; GCN-NEXT: v_mov_b32_e32 v6, 4 @@ -359,8 +359,8 @@ bb: define amdgpu_kernel void @known_ffff8004(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff8004: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff0500 ; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 57f5473749513f..37bf8516403bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -609,68 +609,68 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-LABEL: shuffle8i8: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80008 -; GFX10-NEXT: s_lshl_b32 s5, s9, 8 +; GFX10-NEXT: s_bfe_u32 s2, s5, 0x80008 +; GFX10-NEXT: s_lshl_b32 s1, s9, 8 ; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100010 -; GFX10-NEXT: s_bfe_u32 s4, s2, 0x80008 -; GFX10-NEXT: s_lshl_b32 s6, s8, 8 -; GFX10-NEXT: s_and_b32 s7, s8, 0xff00 -; GFX10-NEXT: s_bfe_u32 s8, s2, 0x80010 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_or_b32 s3, s3, s5 -; GFX10-NEXT: s_lshl_b32 s5, s9, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s6 -; GFX10-NEXT: s_or_b32 s6, s8, s7 -; GFX10-NEXT: s_or_b32 s2, s2, s5 -; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_bfe_u32 s0, s4, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s8, 8 +; GFX10-NEXT: s_and_b32 s5, s8, 0xff00 +; GFX10-NEXT: s_bfe_u32 s8, s4, 0x80010 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_lshl_b32 s2, s9, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s3, s8, s5 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff -; GFX10-NEXT: s_lshl_b32 s5, s6, 16 -; GFX10-NEXT: s_or_b32 s3, s4, s3 -; GFX10-NEXT: s_or_b32 s2, s2, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s1, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: shuffle8i8: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x80008 -; GFX9-NEXT: s_lshl_b32 s6, s8, 8 -; GFX9-NEXT: s_lshl_b32 s5, s9, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80008 -; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_bfe_u32 s6, s9, 0x100010 -; GFX9-NEXT: s_and_b32 s7, s8, 0xff00 -; GFX9-NEXT: s_or_b32 s3, s3, s5 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s7 -; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s3, s3, 16 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x80008 +; GFX9-NEXT: s_lshl_b32 s1, s9, 8 +; GFX9-NEXT: s_bfe_u32 s2, s5, 0x80008 +; GFX9-NEXT: s_lshl_b32 s3, s8, 8 +; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_or_b32 s0, s0, s3 +; GFX9-NEXT: s_bfe_u32 s2, s4, 0x80010 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff +; GFX9-NEXT: s_bfe_u32 s4, s9, 0x100010 +; GFX9-NEXT: s_and_b32 s5, s8, 0xff00 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_or_b32 s2, s2, s5 -; GFX9-NEXT: s_or_b32 s3, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm bb: %vec0 = load <8 x i8>, ptr addrspace(1) %in0 diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll index 40542bc6f05a76..beefc914cc7743 100644 --- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 { ; GCN-LABEL: dbg_clause: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dword v1, v0, s[6:7] +; GCN-NEXT: global_load_dword v1, v0, s[2:3] ; GCN-NEXT: ;DEBUG_VALUE: foo:a <- $vgpr1 -; GCN-NEXT: global_load_dword v2, v0, s[6:7] offset:32 +; GCN-NEXT: global_load_dword v2, v0, s[2:3] offset:32 ; GCN-NEXT: ;DEBUG_VALUE: foo:b <- $vgpr2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v1, v1, v2 -; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: global_store_dword v0, v1, s[0:1] ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index b94c0cd8f4c892..5b8acc31b22cfd 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -55,22 +55,22 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: no_free_sgprs_block_count_x: ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: -; GFX90a-NEXT: s_load_dword s0, s[6:7], 0x28 +; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x28 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -599,24 +599,24 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x1c +; GFX940-NEXT: s_load_dword s0, s[4:5], 0x1c ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshr_b32 s0, s0, 16 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 ; GFX940-NEXT: s_endpgm ; ; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: ; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-NEXT: ; %bb.0: -; GFX90a-NEXT: s_load_dword s0, s[6:7], 0x1c +; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x1c ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 5dd2dfe9a77d14..1126db9cae93f8 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -17,25 +17,25 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, -1 ; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[18:19] -; GFX906-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane ; GFX906-NEXT: s_mov_b32 s21, s15 -; GFX906-NEXT: v_writelane_b32 v39, s6, 0 -; GFX906-NEXT: v_writelane_b32 v39, s7, 1 -; GFX906-NEXT: v_writelane_b32 v39, s21, 2 +; GFX906-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane ; GFX906-NEXT: s_mov_b32 s22, s14 -; GFX906-NEXT: v_writelane_b32 v39, s22, 3 +; GFX906-NEXT: v_writelane_b32 v39, s21, 0 +; GFX906-NEXT: v_writelane_b32 v39, s22, 1 ; GFX906-NEXT: s_mov_b32 s23, s13 -; GFX906-NEXT: v_writelane_b32 v39, s23, 4 +; GFX906-NEXT: v_writelane_b32 v39, s23, 2 ; GFX906-NEXT: s_mov_b32 s24, s12 -; GFX906-NEXT: v_writelane_b32 v39, s24, 5 +; GFX906-NEXT: v_writelane_b32 v39, s24, 3 ; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11] -; GFX906-NEXT: v_writelane_b32 v39, s26, 6 +; GFX906-NEXT: v_writelane_b32 v39, s26, 4 +; GFX906-NEXT: v_writelane_b32 v39, s27, 5 +; GFX906-NEXT: v_writelane_b32 v39, s8, 6 ; GFX906-NEXT: v_writelane_b32 v41, s16, 4 -; GFX906-NEXT: v_writelane_b32 v39, s27, 7 +; GFX906-NEXT: v_writelane_b32 v39, s9, 7 ; GFX906-NEXT: v_writelane_b32 v41, s34, 2 -; GFX906-NEXT: v_writelane_b32 v39, s8, 8 +; GFX906-NEXT: v_writelane_b32 v39, s6, 8 ; GFX906-NEXT: v_writelane_b32 v41, s35, 3 -; GFX906-NEXT: v_writelane_b32 v39, s9, 9 +; GFX906-NEXT: v_writelane_b32 v39, s7, 9 ; GFX906-NEXT: v_writelane_b32 v41, s30, 0 ; GFX906-NEXT: v_writelane_b32 v39, s4, 10 ; GFX906-NEXT: s_addk_i32 s32, 0x2800 @@ -233,19 +233,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: v_readlane_b32 s4, v39, 10 -; GFX906-NEXT: v_readlane_b32 s6, v39, 0 -; GFX906-NEXT: v_readlane_b32 s8, v39, 8 -; GFX906-NEXT: v_readlane_b32 s10, v39, 6 +; GFX906-NEXT: v_readlane_b32 s6, v39, 8 +; GFX906-NEXT: v_readlane_b32 s8, v39, 6 +; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22 -; GFX906-NEXT: v_readlane_b32 s12, v39, 5 +; GFX906-NEXT: v_readlane_b32 s12, v39, 3 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 -; GFX906-NEXT: v_readlane_b32 s13, v39, 4 -; GFX906-NEXT: v_readlane_b32 s14, v39, 3 -; GFX906-NEXT: v_readlane_b32 s15, v39, 2 +; GFX906-NEXT: v_readlane_b32 s13, v39, 2 +; GFX906-NEXT: v_readlane_b32 s14, v39, 1 +; GFX906-NEXT: v_readlane_b32 s15, v39, 0 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 -; GFX906-NEXT: v_readlane_b32 s7, v39, 1 -; GFX906-NEXT: v_readlane_b32 s9, v39, 9 -; GFX906-NEXT: v_readlane_b32 s11, v39, 7 +; GFX906-NEXT: v_readlane_b32 s7, v39, 9 +; GFX906-NEXT: v_readlane_b32 s9, v39, 7 +; GFX906-NEXT: v_readlane_b32 s11, v39, 5 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -253,18 +253,18 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_readlane_b32 s4, v39, 10 -; GFX906-NEXT: v_readlane_b32 s6, v39, 0 -; GFX906-NEXT: v_readlane_b32 s8, v39, 8 -; GFX906-NEXT: v_readlane_b32 s10, v39, 6 +; GFX906-NEXT: v_readlane_b32 s6, v39, 8 +; GFX906-NEXT: v_readlane_b32 s8, v39, 6 +; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 -; GFX906-NEXT: v_readlane_b32 s7, v39, 1 -; GFX906-NEXT: v_readlane_b32 s9, v39, 9 -; GFX906-NEXT: v_readlane_b32 s11, v39, 7 -; GFX906-NEXT: v_readlane_b32 s12, v39, 5 -; GFX906-NEXT: v_readlane_b32 s13, v39, 4 -; GFX906-NEXT: v_readlane_b32 s14, v39, 3 -; GFX906-NEXT: v_readlane_b32 s15, v39, 2 +; GFX906-NEXT: v_readlane_b32 s7, v39, 9 +; GFX906-NEXT: v_readlane_b32 s9, v39, 7 +; GFX906-NEXT: v_readlane_b32 s11, v39, 5 +; GFX906-NEXT: v_readlane_b32 s12, v39, 3 +; GFX906-NEXT: v_readlane_b32 s13, v39, 2 +; GFX906-NEXT: v_readlane_b32 s14, v39, 1 +; GFX906-NEXT: v_readlane_b32 s15, v39, 0 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_readlane_b32 s21, v39, 12 @@ -403,22 +403,22 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[16:17] -; GFX908-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane ; GFX908-NEXT: s_mov_b32 s21, s15 -; GFX908-NEXT: v_writelane_b32 v39, s6, 0 -; GFX908-NEXT: v_writelane_b32 v39, s7, 1 -; GFX908-NEXT: v_writelane_b32 v39, s21, 2 +; GFX908-NEXT: ; implicit-def: $vgpr39 : SGPR spill to VGPR lane ; GFX908-NEXT: s_mov_b32 s22, s14 -; GFX908-NEXT: v_writelane_b32 v39, s22, 3 +; GFX908-NEXT: v_writelane_b32 v39, s21, 0 +; GFX908-NEXT: v_writelane_b32 v39, s22, 1 ; GFX908-NEXT: s_mov_b32 s23, s13 -; GFX908-NEXT: v_writelane_b32 v39, s23, 4 +; GFX908-NEXT: v_writelane_b32 v39, s23, 2 ; GFX908-NEXT: s_mov_b32 s24, s12 -; GFX908-NEXT: v_writelane_b32 v39, s24, 5 +; GFX908-NEXT: v_writelane_b32 v39, s24, 3 ; GFX908-NEXT: s_mov_b64 s[26:27], s[10:11] -; GFX908-NEXT: v_writelane_b32 v39, s26, 6 -; GFX908-NEXT: v_writelane_b32 v39, s27, 7 -; GFX908-NEXT: v_writelane_b32 v39, s8, 8 -; GFX908-NEXT: v_writelane_b32 v39, s9, 9 +; GFX908-NEXT: v_writelane_b32 v39, s26, 4 +; GFX908-NEXT: v_writelane_b32 v39, s27, 5 +; GFX908-NEXT: v_writelane_b32 v39, s8, 6 +; GFX908-NEXT: v_writelane_b32 v39, s9, 7 +; GFX908-NEXT: v_writelane_b32 v39, s6, 8 +; GFX908-NEXT: v_writelane_b32 v39, s7, 9 ; GFX908-NEXT: v_writelane_b32 v39, s4, 10 ; GFX908-NEXT: v_mov_b32_e32 v32, v31 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill @@ -612,19 +612,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: v_readlane_b32 s4, v39, 10 -; GFX908-NEXT: v_readlane_b32 s6, v39, 0 -; GFX908-NEXT: v_readlane_b32 s8, v39, 8 -; GFX908-NEXT: v_readlane_b32 s10, v39, 6 +; GFX908-NEXT: v_readlane_b32 s6, v39, 8 +; GFX908-NEXT: v_readlane_b32 s8, v39, 6 +; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22 -; GFX908-NEXT: v_readlane_b32 s12, v39, 5 +; GFX908-NEXT: v_readlane_b32 s12, v39, 3 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 -; GFX908-NEXT: v_readlane_b32 s13, v39, 4 -; GFX908-NEXT: v_readlane_b32 s14, v39, 3 -; GFX908-NEXT: v_readlane_b32 s15, v39, 2 +; GFX908-NEXT: v_readlane_b32 s13, v39, 2 +; GFX908-NEXT: v_readlane_b32 s14, v39, 1 +; GFX908-NEXT: v_readlane_b32 s15, v39, 0 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 -; GFX908-NEXT: v_readlane_b32 s7, v39, 1 -; GFX908-NEXT: v_readlane_b32 s9, v39, 9 -; GFX908-NEXT: v_readlane_b32 s11, v39, 7 +; GFX908-NEXT: v_readlane_b32 s7, v39, 9 +; GFX908-NEXT: v_readlane_b32 s9, v39, 7 +; GFX908-NEXT: v_readlane_b32 s11, v39, 5 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -632,18 +632,18 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s4, v39, 10 -; GFX908-NEXT: v_readlane_b32 s6, v39, 0 -; GFX908-NEXT: v_readlane_b32 s8, v39, 8 -; GFX908-NEXT: v_readlane_b32 s10, v39, 6 +; GFX908-NEXT: v_readlane_b32 s6, v39, 8 +; GFX908-NEXT: v_readlane_b32 s8, v39, 6 +; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 -; GFX908-NEXT: v_readlane_b32 s7, v39, 1 -; GFX908-NEXT: v_readlane_b32 s9, v39, 9 -; GFX908-NEXT: v_readlane_b32 s11, v39, 7 -; GFX908-NEXT: v_readlane_b32 s12, v39, 5 -; GFX908-NEXT: v_readlane_b32 s13, v39, 4 -; GFX908-NEXT: v_readlane_b32 s14, v39, 3 -; GFX908-NEXT: v_readlane_b32 s15, v39, 2 +; GFX908-NEXT: v_readlane_b32 s7, v39, 9 +; GFX908-NEXT: v_readlane_b32 s9, v39, 7 +; GFX908-NEXT: v_readlane_b32 s11, v39, 5 +; GFX908-NEXT: v_readlane_b32 s12, v39, 3 +; GFX908-NEXT: v_readlane_b32 s13, v39, 2 +; GFX908-NEXT: v_readlane_b32 s14, v39, 1 +; GFX908-NEXT: v_readlane_b32 s15, v39, 0 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_readlane_b32 s21, v39, 12 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index 0d88466fc31b3e..e7b405d7d92707 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -513,12 +513,12 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addr ; ; GCN-LABEL: alloca_promote_atomicrmw_private_lds_promote: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s6, 1 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -555,12 +555,12 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out, ; ; GCN-LABEL: alloca_promote_cmpxchg_private: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s6, 1 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index ad1f790457de97..98d5f3097153d9 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -14,13 +14,13 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s11 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -95,15 +95,15 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -165,20 +165,20 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s11 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 @@ -238,7 +238,7 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -346,13 +346,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s11 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -473,15 +473,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s9 +; GFX900-NEXT: s_add_u32 s36, s36, s11 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 @@ -589,20 +589,20 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s11 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 17, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -701,15 +701,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s9 +; GFX90A-NEXT: s_add_u32 s36, s36, s11 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 @@ -816,7 +816,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -1033,13 +1033,13 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s11 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1119,15 +1119,15 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -1176,20 +1176,20 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s11 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 2 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 @@ -1243,7 +1243,7 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -1348,13 +1348,13 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s11 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1401,15 +1401,15 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -1450,20 +1450,20 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s11 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -1574,13 +1574,13 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s11 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1624,15 +1624,15 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -1669,20 +1669,20 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s11 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 2 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 @@ -1714,7 +1714,7 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -1781,13 +1781,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s42, -1 ; GFX8-NEXT: s_mov_b32 s43, 0xe80000 -; GFX8-NEXT: s_add_u32 s40, s40, s9 +; GFX8-NEXT: s_add_u32 s40, s40, s11 ; GFX8-NEXT: s_addc_u32 s41, s41, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1844,15 +1844,15 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s42, -1 ; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s9 +; GFX9-NEXT: s_add_u32 s40, s40, s11 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -1903,20 +1903,20 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s42, -1 ; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 -; GFX10-NEXT: s_add_u32 s40, s40, s9 +; GFX10-NEXT: s_add_u32 s40, s40, s11 ; GFX10-NEXT: s_addc_u32 s41, s41, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff8000, v0 ; GFX10-NEXT: v_add_co_u32 v8, s0, s36, v16 @@ -1963,7 +1963,7 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -2051,13 +2051,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s11 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2132,15 +2132,15 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -2201,20 +2201,20 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s11 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 @@ -2278,7 +2278,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -2387,13 +2387,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s11 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2429,15 +2429,15 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 @@ -2470,20 +2470,20 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s11 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 ; GFX10-NEXT: s_getpc_b64 s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 @@ -2512,7 +2512,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -2565,7 +2565,7 @@ entry: define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX8-LABEL: negativeoffsetnullptr: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s1, s[2:3], 0xec +; GFX8-NEXT: s_load_dword s1, s[4:5], 0xec ; GFX8-NEXT: s_add_u32 s0, 0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_addc_u32 s1, s1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index f916c9375bc6de..e2f4d1c6e57bcb 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -5,28 +5,28 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) { ; SDAG-LABEL: buffers_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GISEL-NEXT: s_endpgm %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; SDAG-NEXT: s_mov_b32 s3, 0 -; SDAG-NEXT: s_mov_b32 s2, 16 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; SDAG-NEXT: s_mov_b32 s7, 0 +; SDAG-NEXT: s_mov_b32 s6, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_and_b32 s1, s5, 0xffff -; SDAG-NEXT: s_mov_b32 s0, s4 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SDAG-NEXT: s_and_b32 s1, s7, 0xffff -; SDAG-NEXT: s_mov_b32 s0, s6 +; SDAG-NEXT: s_and_b32 s5, s1, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: s_and_b32 s5, s3, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s2 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GISEL-NEXT: s_mov_b32 s3, 0 -; GISEL-NEXT: s_mov_b32 s2, 16 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GISEL-NEXT: s_mov_b32 s7, 0 +; GISEL-NEXT: s_mov_b32 s6, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s1, s5, 0xffff -; GISEL-NEXT: s_mov_b32 s0, s4 -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GISEL-NEXT: s_and_b32 s1, s7, 0xffff -; GISEL-NEXT: s_mov_b32 s0, s6 +; GISEL-NEXT: s_and_b32 s5, s1, 0xffff +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: s_and_b32 s5, s3, 0xffff +; GISEL-NEXT: s_mov_b32 s4, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GISEL-NEXT: s_endpgm %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) @@ -110,46 +110,46 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 -; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 -; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 -; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12 +; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_might_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 -; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 -; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 -; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12 +; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12 ; GISEL-NEXT: s_endpgm %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -173,28 +173,28 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; SDAG-LABEL: independent_offsets: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4 +; SDAG-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen +; SDAG-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8 +; SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: independent_offsets: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4 +; GISEL-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen +; GISEL-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8 +; GISEL-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 ; GISEL-NEXT: s_endpgm %lane = call i32 @llvm.amdgcn.workitem.id.x() %idx = shl i32 %lane, 2 diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 92465420a1ae73..997c61a27c4392 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -748,21 +748,21 @@ define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 { define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -800,21 +800,21 @@ define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -852,21 +852,21 @@ define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -904,21 +904,21 @@ define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -956,21 +956,21 @@ define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 { ; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1008,21 +1008,21 @@ define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, |s4| +; SI-NEXT: v_rcp_f32_e64 v0, |s2| +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, |s4| +; VI-NEXT: v_rcp_f32_e64 v2, |s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1061,21 +1061,21 @@ define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float % define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_neg_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -s4 +; SI-NEXT: v_rcp_f32_e64 v0, -s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_neg_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -s4 +; VI-NEXT: v_rcp_f32_e64 v2, -s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1116,21 +1116,21 @@ define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %s define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -|s4| +; SI-NEXT: v_rcp_f32_e64 v0, -|s2| +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s4| +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1173,13 +1173,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, fl define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -|s4| -; SI-NEXT: v_mul_f32_e64 v1, s4, -|s4| +; SI-NEXT: v_rcp_f32_e64 v0, -|s6| +; SI-NEXT: v_mul_f32_e64 v1, s6, -|s6| ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -1188,13 +1188,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1 ; ; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s4| +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4| +; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2| ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -1243,21 +1243,21 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1 define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 { ; SI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, s4, 0.5 +; SI-NEXT: v_mul_f32_e64 v0, s6, 0.5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5 +; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1297,23 +1297,23 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { ; SI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-NEXT: v_mul_f32_e32 v0, s6, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s4, v0 +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1354,23 +1354,23 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { ; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-NEXT: v_mul_f32_e32 v0, s6, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s4, v0 +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index b1fa85f7c675b7..a995187390806f 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -3212,127 +3212,128 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 { ; GFX67-LABEL: compute_mad: ; GFX67: ; %bb.0: ; %bb -; GFX67-NEXT: s_load_dword s0, s[2:3], 0x6 -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 +; GFX67-NEXT: s_load_dword s0, s[4:5], 0x6 +; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_add_i32 s0, s0, 1 ; GFX67-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, s0, v1 ; GFX67-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-NEXT: s_load_dword s2, s[10:11], 0x1 -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX67-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 ; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_and_b32 s2, s2, 0xffff +; GFX67-NEXT: s_load_dword s2, s[14:15], 0x1 +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[12:13], 0x0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v3 -; GFX67-NEXT: s_mul_i32 s6, s6, s2 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) +; GFX67-NEXT: s_and_b32 s2, s2, 0xffff ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GFX67-NEXT: s_mov_b32 s6, 0 +; GFX67-NEXT: s_mul_i32 s8, s8, s2 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s8, v0 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX67-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX67-NEXT: v_mov_b32_e32 v2, s1 +; GFX67-NEXT: v_mov_b32_e32 v2, s5 +; GFX67-NEXT: s_mov_b32 s2, 0 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v4, v3, v1 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX67-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX67-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v4, v3 -; GFX67-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX67-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: compute_mad: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x18 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x18 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s0, s0, 1 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[10:11], 0x4 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s4, 0xffff -; GFX8-NEXT: s_mul_i32 s6, s6, s1 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_mul_i32 s8, s8, s2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: compute_mad: ; GFX900: ; %bb.0: ; %bb -; GFX900-NEXT: s_load_dword s0, s[2:3], 0x18 +; GFX900-NEXT: s_load_dword s0, s[4:5], 0x18 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_add_i32 s0, s0, 1 ; GFX900-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX900-NEXT: v_add_u32_e32 v2, s0, v1 ; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX900-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX900-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[10:11], 0x4 -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s9, s[2:3], 0x4 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, s1 +; GFX900-NEXT: v_mov_b32_e32 v5, s7 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_and_b32 s1, s4, 0xffff +; GFX900-NEXT: s_and_b32 s0, s9, 0xffff ; GFX900-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX900-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX900-NEXT: v_add_u32_e32 v2, 1, v3 -; GFX900-NEXT: s_mul_i32 s6, s6, s1 -; GFX900-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX900-NEXT: s_mul_i32 s8, s8, s0 +; GFX900-NEXT: v_add_u32_e32 v0, s8, v0 ; GFX900-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: v_mov_b32_e32 v4, s5 ; GFX900-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX900-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v3, v[1:2] -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v3, v[1:2] +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s4, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX900-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] -; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, v1, v[2:3] -; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v3 +; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v1, v[2:3] +; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s6, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX900-NEXT: global_store_dword v[1:2], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: compute_mad: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x18 -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s4, s4, 1 -; GFX90A-NEXT: v_mul_lo_u32 v0, s4, v4 -; GFX90A-NEXT: v_add_u32_e32 v1, s4, v0 +; GFX90A-NEXT: s_add_i32 s9, s9, 1 +; GFX90A-NEXT: v_mul_lo_u32 v0, s9, v4 +; GFX90A-NEXT: v_add_u32_e32 v1, s9, v0 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v4 ; GFX90A-NEXT: v_add_u32_e32 v0, 1, v0 ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v0 @@ -3340,57 +3341,57 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX90A-NEXT: v_add_u32_e32 v1, 1, v2 ; GFX90A-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX90A-NEXT: s_load_dword s7, s[10:11], 0x4 ; GFX90A-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v0, v2, v[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s4, s7, 0xffff -; GFX90A-NEXT: s_mul_i32 s6, s6, s4 -; GFX90A-NEXT: v_add_u32_e32 v1, s6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s2, v1 +; GFX90A-NEXT: s_and_b32 s2, s4, 0xffff +; GFX90A-NEXT: s_mul_i32 s8, s8, s2 +; GFX90A-NEXT: v_add_u32_e32 v1, s8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, s7 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s6, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX90A-NEXT: global_store_dword v[2:3], v0, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: compute_mad: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x18 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x18 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s0, s0, 1 ; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s0, v1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: s_load_dword s4, s[10:11], 0x4 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s4, 0xffff +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s6, s4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s8, s2, v[0:1] ; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2 -; GFX10-NEXT: v_add_co_u32 v2, s2, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s2 +; GFX10-NEXT: v_add_co_u32 v2, s0, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s1, 0, s0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5] -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, s4, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s5, v3, vcc_lo ; GFX10-NEXT: global_store_dword v[1:2], v0, off ; GFX10-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index d1e785f8daa0af..017b37af4cdf26 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -21,7 +21,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotl_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -47,17 +47,17 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s0, 32, s7 -; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_sub_i32 s3, 32, s3 +; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 @@ -93,62 +93,62 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s7, 32, s7 -; SI-NEXT: s_sub_i32 s6, 32, s6 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v1, s5, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v0, s4, s4, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_sub_i32 s3, 32, s3 +; SI-NEXT: s_sub_i32 s2, 32, s2 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 32, s6 -; GFX8-NEXT: s_sub_i32 s3, 32, s7 +; GFX8-NEXT: s_sub_i32 s2, 32, s2 +; GFX8-NEXT: s_sub_i32 s3, 32, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s7 -; GFX10-NEXT: s_sub_i32 s3, 32, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_sub_i32 s3, 32, s3 +; GFX10-NEXT: s_sub_i32 s2, 32, s2 +; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s2, 32, s7 -; GFX11-NEXT: s_sub_i32 s3, 32, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_sub_i32 s2, 32, s2 +; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = shl <2 x i32> %x, %y @@ -184,44 +184,44 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s8, 32, s8 -; SI-NEXT: s_sub_i32 s9, 32, s9 -; SI-NEXT: s_sub_i32 s11, 32, s11 -; SI-NEXT: s_sub_i32 s10, 32, s10 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_alignbit_b32 v3, s7, s7, v0 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_alignbit_b32 v2, s6, s6, v0 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_alignbit_b32 v1, s5, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, s4, v0 +; SI-NEXT: s_sub_i32 s4, 32, s12 +; SI-NEXT: s_sub_i32 s5, 32, s13 +; SI-NEXT: s_sub_i32 s6, 32, s15 +; SI-NEXT: s_sub_i32 s7, 32, s14 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotl_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s3, 32, s9 -; GFX8-NEXT: s_sub_i32 s9, 32, s11 -; GFX8-NEXT: s_sub_i32 s2, 32, s8 -; GFX8-NEXT: s_sub_i32 s8, 32, s10 -; GFX8-NEXT: v_mov_b32_e32 v0, s9 -; GFX8-NEXT: v_alignbit_b32 v3, s7, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v0 +; GFX8-NEXT: s_sub_i32 s5, 32, s15 +; GFX8-NEXT: s_sub_i32 s4, 32, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: s_sub_i32 s3, 32, s13 +; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_sub_i32 s2, 32, s12 +; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0 +; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0 +; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -229,36 +229,36 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotl_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s8 -; GFX10-NEXT: s_sub_i32 s3, 32, s9 -; GFX10-NEXT: s_sub_i32 s8, 32, s11 -; GFX10-NEXT: s_sub_i32 s9, 32, s10 -; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s8 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s2 +; GFX10-NEXT: s_sub_i32 s2, 32, s12 +; GFX10-NEXT: s_sub_i32 s3, 32, s13 +; GFX10-NEXT: s_sub_i32 s4, 32, s15 +; GFX10-NEXT: s_sub_i32 s5, 32, s14 +; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s4 +; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s5 +; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s2, 32, s8 -; GFX11-NEXT: s_sub_i32 s3, 32, s9 -; GFX11-NEXT: s_sub_i32 s8, 32, s11 -; GFX11-NEXT: s_sub_i32 s9, 32, s10 -; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s8 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s9 -; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s2 +; GFX11-NEXT: s_sub_i32 s2, 32, s12 +; GFX11-NEXT: s_sub_i32 s3, 32, s13 +; GFX11-NEXT: s_sub_i32 s4, 32, s15 +; GFX11-NEXT: s_sub_i32 s5, 32, s14 +; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s4 +; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s5 +; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 2188a9864faa0d..db56589b799dda 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotr_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 @@ -43,16 +43,16 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -82,54 +82,54 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v1, s5, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v0, s4, s4, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 +; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s7 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s6 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s6 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %tmp0 = sub <2 x i32> , %y @@ -157,36 +157,36 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_alignbit_b32 v3, s7, s7, v0 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_alignbit_b32 v2, s6, s6, v0 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_alignbit_b32 v1, s5, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_alignbit_b32 v0, s4, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotr_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s11 -; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: v_alignbit_b32 v3, s7, s7, v0 -; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v1 -; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v4 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mov_b32_e32 v4, s13 +; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 +; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0 +; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -194,28 +194,28 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s11 -; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s10 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s9 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s8 +; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s15 +; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s14 +; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s13 +; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s12 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11 -; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s10 -; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s9 -; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s8 +; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s15 +; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s14 +; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s13 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s12 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index 40a8592dba6df0..f3c9a5c471aca0 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -20,7 +20,7 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -38,7 +38,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -56,7 +56,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-DAZ-SAFE-LABEL: rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -134,7 +134,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -198,40 +198,40 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[4:5], 0xb +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; ; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -251,8 +251,8 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 @@ -289,8 +289,8 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; ; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb +; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 @@ -367,7 +367,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-UNSAFE-NEXT: s_endpgm ; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -391,7 +391,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -466,7 +466,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -552,7 +552,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -571,7 +571,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -606,7 +606,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -649,7 +649,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -714,7 +714,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -733,7 +733,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -752,7 +752,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -787,7 +787,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -830,7 +830,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index 0fe0c57df9d3ad..a2624e5f61307a 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -83,28 +83,29 @@ define void @func2() { define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; GFX12-SDAG-LABEL: kernel1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0 ; GFX12-SDAG-NEXT: s_barrier_init m0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[2:3], 48 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48 +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshr_b32 s0, s0, 4 +; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_and_b32 s0, s0, 63 +; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_or_b32 s1, 0x90000, s0 +; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_mov_b32 m0, s1 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX12-SDAG-NEXT: s_barrier_init m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002 ; GFX12-SDAG-NEXT: s_barrier_signal m0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s1 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX12-SDAG-NEXT: s_barrier_signal m0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX12-SDAG-NEXT: s_barrier_signal -1 ; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 ; GFX12-SDAG-NEXT: s_barrier_join m0 @@ -112,46 +113,48 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_barrier_wait 1 ; GFX12-SDAG-NEXT: s_barrier_leave ; GFX12-SDAG-NEXT: s_wakeup_barrier m0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX12-SDAG-NEXT: s_wakeup_barrier m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 -; GFX12-SDAG-NEXT: s_get_barrier_state s1, m0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 -; GFX12-SDAG-NEXT: s_get_barrier_state s0, m0 +; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+12 +; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+12 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+24 ; GFX12-SDAG-NEXT: s_barrier_signal -1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_barrier_wait -1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12 +; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: kernel1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b64 s[12:13], s[4:5] ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-GISEL-NEXT: s_load_b32 s0, s[12:13], 0x2c ; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0 ; GFX12-GISEL-NEXT: s_barrier_init m0 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4 @@ -169,7 +172,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_barrier_signal -1 ; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 ; GFX12-GISEL-NEXT: s_mov_b32 m0, s0 -; GFX12-GISEL-NEXT: s_add_co_u32 s8, s2, 48 +; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 ; GFX12-GISEL-NEXT: s_barrier_join m0 ; GFX12-GISEL-NEXT: s_barrier_wait 1 ; GFX12-GISEL-NEXT: s_barrier_leave @@ -178,7 +181,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s3, 0 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe @@ -192,8 +195,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-GISEL-NEXT: s_add_co_u32 s8, s2, 48 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s3, 0 +; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0 ; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe ; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1 @@ -229,50 +232,52 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; GFX12-SDAG-LABEL: kernel2: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-SDAG-NEXT: s_getpc_b64 s[4:5] +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX12-SDAG-NEXT: s_getpc_b64 s[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-SDAG-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+12 +; GFX12-SDAG-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+12 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+24 +; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[2:3], 48 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_barrier_signal m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: kernel2: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_add_co_u32 s8, s2, 48 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s3, 0 -; GFX12-GISEL-NEXT: s_getpc_b64 s[2:3] +; GFX12-GISEL-NEXT: s_add_co_u32 s8, s4, 48 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0 +; GFX12-GISEL-NEXT: s_getpc_b64 s[4:5] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_sext_i32_i16 s3, s3 -; GFX12-GISEL-NEXT: s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12 +; GFX12-GISEL-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+12 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_load_b64 s[12:13], s[4:5], 0x0 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70002 ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 2 ; GFX12-GISEL-NEXT: s_barrier_wait 1 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-GISEL-NEXT: s_endpgm call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar) diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll index 02641f5b6ae8c1..b00e9dacfde36b 100644 --- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -5,24 +5,24 @@ define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { ; GFX6-LABEL: s_mulk_i32_k0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mulk_i32 s4, 0x41 +; GFX6-NEXT: s_mul_i32 s4, s6, 0x41 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_mulk_i32_k0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mulk_i32 s4, 0x41 +; GFX8-NEXT: s_mul_i32 s4, s6, 0x41 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -34,24 +34,24 @@ define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { define amdgpu_kernel void @s_mulk_i32_k1(ptr addrspace(1) %out, i32 %b) { ; GFX6-LABEL: s_mulk_i32_k1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mulk_i32 s4, 0x7fff +; GFX6-NEXT: s_mul_i32 s4, s6, 0x7fff ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_mulk_i32_k1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mulk_i32 s4, 0x7fff +; GFX8-NEXT: s_mul_i32 s4, s6, 0x7fff ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -63,24 +63,24 @@ define amdgpu_kernel void @s_mulk_i32_k1(ptr addrspace(1) %out, i32 %b) { define amdgpu_kernel void @s_mulk_i32_k2(ptr addrspace(1) %out, i32 %b) { ; GFX6-LABEL: s_mulk_i32_k2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mulk_i32 s4, 0xffef +; GFX6-NEXT: s_mul_i32 s4, s6, 0xffffffef ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_mulk_i32_k2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mulk_i32 s4, 0xffef +; GFX8-NEXT: s_mul_i32 s4, s6, 0xffffffef ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -92,24 +92,24 @@ define amdgpu_kernel void @s_mulk_i32_k2(ptr addrspace(1) %out, i32 %b) { define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { ; GFX6-LABEL: no_s_mulk_i32_k0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX6-NEXT: s_mul_i32 s4, s6, 0x8001 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: no_s_mulk_i32_k0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX8-NEXT: s_mul_i32 s4, s6, 0x8001 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -123,7 +123,7 @@ define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { define amdgpu_kernel void @commute_s_mulk_i32(ptr addrspace(1) %out, i32 %b) #0 { ; GFX6-LABEL: commute_s_mulk_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x2 +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x2 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mulk_i32 s0, 0x800 @@ -134,7 +134,7 @@ define amdgpu_kernel void @commute_s_mulk_i32(ptr addrspace(1) %out, i32 %b) #0 ; ; GFX8-LABEL: commute_s_mulk_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mulk_i32 s0, 0x800 diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index c2132cf907fdb2..2b47095c6cf14a 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -30,8 +30,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) { ; GCN-LABEL: v_sad_u32_constant_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 @@ -55,8 +55,8 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -81,9 +81,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -117,9 +117,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 @@ -149,9 +149,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 @@ -184,9 +184,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -220,9 +220,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 @@ -253,9 +253,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] ; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s16, s16, s15 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 @@ -285,24 +285,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc +; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_sad_u32 v3, s11, v0, v1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_sad_u32 v2, s10, v2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_sad_u32 v1, s9, v0, v1 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_sad_u32 v0, s8, v0, v4 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_sad_u32 v3, s3, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_sad_u32 v2, s2, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_sad_u32 v1, s1, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_sad_u32 v0, s0, v0, v4 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %icmp0 = icmp ugt <4 x i32> %a, %b @@ -321,24 +321,24 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc +; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_sad_u32 v3, s11, v0, v1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_sad_u32 v2, s10, v2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_sad_u32 v1, s9, v0, v1 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_sad_u32 v0, s8, v0, v4 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_sad_u32 v3, s3, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_sad_u32 v2, s2, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_sad_u32 v1, s1, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: v_sad_u32 v0, s0, v0, v4 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %icmp0 = icmp ugt <4 x i32> %a, %b @@ -355,9 +355,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) { ; GCN-LABEL: v_sad_u32_i16_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 @@ -386,7 +386,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: flat_load_ushort v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] glc @@ -414,8 +414,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) { ; GCN-LABEL: v_sad_u32_i8_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -445,7 +445,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc @@ -473,8 +473,8 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { ; GCN-LABEL: s_sad_u32_i8_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -500,8 +500,8 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -529,8 +529,8 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index d8deb810971106..b4eb7750081222 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -15,104 +15,104 @@ declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: saddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_add_u32 s10, s6, s8 -; SI-NEXT: s_addc_u32 s11, s7, s9 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_add_u32 s10, s2, s8 +; SI-NEXT: s_addc_u32 s11, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; SI-NEXT: v_cmp_lt_i64_e64 s[6:7], s[8:9], 0 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[8:9], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: saddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: s_add_u32 s2, s6, s0 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: s_addc_u32 s3, s7, s1 -; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_add_u32 s6, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: s_addc_u32 s7, s3, s5 +; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[4:5], 0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_add_u32 s4, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_addc_u32 s5, s3, s7 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[6:7], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: saddo_i64_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s6, s0 -; GFX10-NEXT: s_addc_u32 s3, s7, s1 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] -; GFX10-NEXT: s_xor_b32 s0, s0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_add_u32 s4, s2, s6 +; GFX10-NEXT: s_addc_u32 s5, s3, s7 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: s_xor_b32 s2, s6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: saddo_i64_zext: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s2, s6, s0 -; GFX11-NEXT: s_addc_u32 s3, s7, s1 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] +; GFX11-NEXT: s_add_u32 s6, s2, s4 +; GFX11-NEXT: s_addc_u32 s7, s3, s5 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: s_xor_b32 s2, s4, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 @@ -126,25 +126,25 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_add_i32 s14, s12, s13 -; SI-NEXT: s_cmp_lt_i32 s13, 0 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s14, s12 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_add_i32 s12, s8, s9 +; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s12, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9] +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -152,20 +152,20 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_add_i32 s4, s0, s1 -; VI-NEXT: s_cmp_lt_i32 s1, 0 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_lt_i32 s4, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_i32 s6, s4, s5 +; VI-NEXT: s_cmp_lt_i32 s5, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_cmp_lt_i32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[2:3], v0 @@ -173,44 +173,44 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_add_i32 s4, s6, s7 +; GFX9-NEXT: v_add_i32 v1, s6, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_saddo_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0 +; GFX10-NEXT: v_add_nc_i32 v0, s6, s7 clamp +; GFX10-NEXT: s_add_i32 s4, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dword v1, v2, s[4:5] -; GFX10-NEXT: global_store_byte v1, v0, s[6:7] +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_saddo_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp -; GFX11-NEXT: s_add_i32 s4, s4, s5 +; GFX11-NEXT: v_add_nc_i32 v0, s6, s7 clamp +; GFX11-NEXT: s_add_i32 s4, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 @@ -230,7 +230,7 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -260,7 +260,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -284,40 +284,40 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_saddo_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[8:9] -; GFX10-NEXT: global_load_dword v2, v0, s[10:11] +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] -; GFX10-NEXT: global_store_byte v0, v2, s[6:7] +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_saddo_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -346,7 +346,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -373,7 +373,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -395,43 +395,43 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s8, s10 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s9, s11 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[14:15], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v2, v0, s[6:7] +; GFX9-NEXT: global_store_byte v2, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s8, s10 -; GFX10-NEXT: s_addc_u32 s1, s9, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9] +; GFX10-NEXT: s_add_u32 s0, s12, s14 +; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13] ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_xor_b32 s0, s2, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] -; GFX10-NEXT: global_store_byte v2, v3, s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v2, v3, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s8, s4, s6 ; GFX11-NEXT: s_addc_u32 s9, s5, s7 @@ -457,7 +457,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -488,7 +488,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -513,30 +513,30 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v6, v0, s[6:7] +; GFX9-NEXT: global_store_byte v6, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo @@ -544,13 +544,13 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] ; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] -; GFX10-NEXT: global_store_byte v6, v0, s[6:7] +; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX10-NEXT: global_store_byte v6, v0, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -582,7 +582,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -617,7 +617,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_saddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -646,11 +646,11 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v5, v1, v3 ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp @@ -660,18 +660,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_saddo_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[14:15] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3 ; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp @@ -681,13 +681,13 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_saddo_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index f299232918d99b..e8f86a6ce63ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -53,7 +53,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -73,7 +73,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -226,23 +226,23 @@ bb: define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zeroext %val) nounwind { ; SI-LABEL: scalar_to_vector_test6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_to_vector_test6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 89a09dc4fcc171..884ba3fc34dff2 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8i16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX906-LABEL: scalar_to_vector_v8i16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -37,7 +37,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX908-LABEL: scalar_to_vector_v8i16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -52,7 +52,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX90A-LABEL: scalar_to_vector_v8i16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -86,7 +86,7 @@ entry: define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8f16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -101,7 +101,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX906-LABEL: scalar_to_vector_v8f16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -116,7 +116,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX908-LABEL: scalar_to_vector_v8f16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -131,7 +131,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX90A-LABEL: scalar_to_vector_v8f16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 7f8240eeb98ebf..0ad10437299f48 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -20,15 +20,15 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: v_writelane_b32 v22, s2, 0 -; CHECK-NEXT: v_writelane_b32 v22, s3, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:7] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v22, s2, 0 +; CHECK-NEXT: v_writelane_b32 v22, s3, 1 ; CHECK-NEXT: v_writelane_b32 v22, s4, 2 ; CHECK-NEXT: v_writelane_b32 v22, s5, 3 ; CHECK-NEXT: v_writelane_b32 v22, s6, 4 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index 0dac327a2297c7..268322bd074bfd 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -7,7 +7,7 @@ ; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s ; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s -; GCN Trackers are sensitive to minor changes in RP, and will avoid scheduling certain instructions, which, if scheduled, +; GCN Trackers are sensitive to minor changes in RP, and will avoid scheduling certain instructions, which, if scheduled, ; allow scheduling of other instructions which reduce RP ; CHECK-LABEL: {{^}}return_72xi32: @@ -73,8 +73,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % } ; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure: -; GFX908: NumSgprs: 56 -; GFX908-GCNTRACKERS: NumSgprs: 56 +; GFX908: NumSgprs: 64 +; GFX908-GCNTRACKERS: NumSgprs: 64 ; GFX908: NumVgprs: 43 ; GFX908-GCNTRACKERS: NumVgprs: 39 ; GFX908: Occupancy: 5 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index c10600a78e3e76..6225ff73e28d08 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 @@ -60,7 +60,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: sdiv_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -104,46 +104,46 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: s_abs_i32 s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_xor_b32 s5, s4, s6 -; GFX9-NEXT: s_sub_i32 s6, 0, s7 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_abs_i32 s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_sub_i32 s7, 0, s5 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_abs_i32 s4, s4 -; GFX9-NEXT: s_ashr_i32 s5, s5, 31 +; GFX9-NEXT: s_abs_i32 s6, s6 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s6, s8, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s6, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_add_i32 s9, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 +; GFX9-NEXT: s_mul_i32 s8, s7, s5 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_add_i32 s9, s7, 1 +; GFX9-NEXT: s_sub_i32 s8, s6, s5 +; GFX9-NEXT: s_cmp_ge_u32 s6, s5 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s5 +; GFX9-NEXT: s_cselect_b32 s5, s8, s7 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -199,7 +199,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -220,7 +220,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -241,23 +241,23 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_i32_4: @@ -293,7 +293,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: slow_sdiv_i32_3435: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -316,7 +316,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; TONGA-LABEL: slow_sdiv_i32_3435: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -339,25 +339,25 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: slow_sdiv_i32_3435: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0 -; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: slow_sdiv_i32_3435: @@ -391,7 +391,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -462,7 +462,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -533,75 +533,75 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: s_abs_i32 s1, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_xor_b32 s0, s7, s0 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_xor_b32 s0, s5, s0 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 -; GFX9-NEXT: s_abs_i32 s7, s7 -; GFX9-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s9 -; GFX9-NEXT: s_mul_hi_u32 s0, s9, s0 -; GFX9-NEXT: s_add_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s7, s9 -; GFX9-NEXT: s_mul_i32 s9, s0, s1 -; GFX9-NEXT: s_sub_i32 s7, s7, s9 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s7 +; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0 +; GFX9-NEXT: s_add_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7 +; GFX9-NEXT: s_mul_i32 s7, s0, s1 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 ; GFX9-NEXT: s_add_i32 s10, s0, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_sub_i32 s7, s5, s1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 ; GFX9-NEXT: s_cselect_b32 s0, s10, s0 -; GFX9-NEXT: s_cselect_b32 s7, s9, s7 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s7, s9, s0 -; GFX9-NEXT: s_abs_i32 s9, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s5, s7, s0 +; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_xor_b32 s5, s5, s6 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_sub_i32 s9, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s4, s6 -; GFX9-NEXT: s_xor_b32 s6, s7, s8 -; GFX9-NEXT: s_sub_i32 s7, 0, s9 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 -; GFX9-NEXT: s_abs_i32 s4, s4 -; GFX9-NEXT: s_ashr_i32 s5, s5, 31 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s7, s7, s8 -; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s7, s9 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_add_i32 s10, s7, 1 -; GFX9-NEXT: s_sub_i32 s8, s4, s9 -; GFX9-NEXT: s_cmp_ge_u32 s4, s9 -; GFX9-NEXT: s_cselect_b32 s7, s10, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s7, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s9 -; GFX9-NEXT: s_cselect_b32 s4, s8, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_abs_i32 s8, s8 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9 +; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s6, s8, s6 +; GFX9-NEXT: s_mul_i32 s9, s6, s7 +; GFX9-NEXT: s_sub_i32 s8, s8, s9 +; GFX9-NEXT: s_add_i32 s10, s6, 1 +; GFX9-NEXT: s_sub_i32 s9, s8, s7 +; GFX9-NEXT: s_cmp_ge_u32 s8, s7 +; GFX9-NEXT: s_cselect_b32 s6, s10, s6 +; GFX9-NEXT: s_cselect_b32 s8, s9, s8 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s8, s7 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_sub_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -682,7 +682,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -707,7 +707,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -732,17 +732,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v2i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -752,7 +752,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v2i32_4: @@ -791,7 +791,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, s10 @@ -918,7 +918,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s11, 0xf000 ; TONGA-NEXT: s_mov_b32 s10, -1 ; TONGA-NEXT: s_mov_b32 s6, s10 @@ -1045,136 +1045,136 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_abs_i32 s1, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s7, v4 -; GFX9-NEXT: s_xor_b32 s0, s7, s0 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 +; GFX9-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9-NEXT: s_xor_b32 s0, s5, s0 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 -; GFX9-NEXT: s_abs_i32 s7, s7 -; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s9, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s9 -; GFX9-NEXT: s_mul_hi_u32 s0, s9, s0 -; GFX9-NEXT: s_add_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s7, s9 -; GFX9-NEXT: s_mul_i32 s9, s0, s1 -; GFX9-NEXT: s_sub_i32 s7, s7, s9 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s7 +; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0 +; GFX9-NEXT: s_add_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7 +; GFX9-NEXT: s_mul_i32 s7, s0, s1 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 ; GFX9-NEXT: s_add_i32 s10, s0, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 +; GFX9-NEXT: s_sub_i32 s7, s5, s1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 ; GFX9-NEXT: s_cselect_b32 s0, s10, s0 -; GFX9-NEXT: s_cselect_b32 s7, s9, s7 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s1, s9, s0 -; GFX9-NEXT: s_abs_i32 s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s1, s1, s8 -; GFX9-NEXT: s_sub_i32 s10, 0, s7 -; GFX9-NEXT: s_sub_i32 s8, s1, s8 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_cselect_b32 s1, s7, s0 +; GFX9-NEXT: s_abs_i32 s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_xor_b32 s1, s1, s6 +; GFX9-NEXT: s_sub_i32 s10, 0, s5 +; GFX9-NEXT: s_sub_i32 s6, s1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s9, v5 -; GFX9-NEXT: s_xor_b32 s6, s9, s6 -; GFX9-NEXT: s_abs_i32 s9, s9 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s8, v5 +; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s6, s6, 31 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v2 +; GFX9-NEXT: s_abs_i32 s8, s8 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: s_mul_i32 s10, s10, s1 ; GFX9-NEXT: s_mul_hi_u32 s10, s1, s10 ; GFX9-NEXT: s_add_i32 s1, s1, s10 -; GFX9-NEXT: s_mul_hi_u32 s1, s9, s1 -; GFX9-NEXT: s_mul_i32 s10, s1, s7 -; GFX9-NEXT: s_sub_i32 s9, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 +; GFX9-NEXT: s_mul_i32 s10, s1, s5 +; GFX9-NEXT: s_sub_i32 s8, s8, s10 ; GFX9-NEXT: s_add_i32 s11, s1, 1 -; GFX9-NEXT: s_sub_i32 s10, s9, s7 -; GFX9-NEXT: s_cmp_ge_u32 s9, s7 +; GFX9-NEXT: s_sub_i32 s10, s8, s5 +; GFX9-NEXT: s_cmp_ge_u32 s8, s5 ; GFX9-NEXT: s_cselect_b32 s1, s11, s1 -; GFX9-NEXT: s_cselect_b32 s9, s10, s9 +; GFX9-NEXT: s_cselect_b32 s8, s10, s8 ; GFX9-NEXT: s_add_i32 s10, s1, 1 -; GFX9-NEXT: s_cmp_ge_u32 s9, s7 -; GFX9-NEXT: s_cselect_b32 s7, s10, s1 -; GFX9-NEXT: s_abs_i32 s9, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: s_sub_i32 s11, 0, s9 -; GFX9-NEXT: s_sub_i32 s6, s7, s6 +; GFX9-NEXT: s_cmp_ge_u32 s8, s5 +; GFX9-NEXT: s_cselect_b32 s5, s10, s1 +; GFX9-NEXT: s_abs_i32 s8, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_sub_i32 s11, 0, s8 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s10, v6 -; GFX9-NEXT: s_xor_b32 s4, s10, s4 +; GFX9-NEXT: s_xor_b32 s7, s10, s7 ; GFX9-NEXT: s_abs_i32 s10, s10 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_readfirstlane_b32 s5, v3 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s11, s11, s7 -; GFX9-NEXT: s_mul_hi_u32 s11, s7, s11 -; GFX9-NEXT: s_add_i32 s7, s7, s11 -; GFX9-NEXT: s_mul_hi_u32 s7, s10, s7 -; GFX9-NEXT: s_mul_i32 s11, s7, s9 +; GFX9-NEXT: s_ashr_i32 s7, s7, 31 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s11, s11, s5 +; GFX9-NEXT: s_mul_hi_u32 s11, s5, s11 +; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s5 +; GFX9-NEXT: s_mul_i32 s11, s5, s8 ; GFX9-NEXT: s_sub_i32 s10, s10, s11 -; GFX9-NEXT: s_add_i32 s12, s7, 1 -; GFX9-NEXT: s_sub_i32 s11, s10, s9 -; GFX9-NEXT: s_cmp_ge_u32 s10, s9 -; GFX9-NEXT: s_cselect_b32 s7, s12, s7 +; GFX9-NEXT: s_add_i32 s12, s5, 1 +; GFX9-NEXT: s_sub_i32 s11, s10, s8 +; GFX9-NEXT: s_cmp_ge_u32 s10, s8 +; GFX9-NEXT: s_cselect_b32 s5, s12, s5 ; GFX9-NEXT: s_cselect_b32 s10, s11, s10 -; GFX9-NEXT: s_add_i32 s11, s7, 1 -; GFX9-NEXT: s_cmp_ge_u32 s10, s9 -; GFX9-NEXT: s_cselect_b32 s7, s11, s7 -; GFX9-NEXT: s_abs_i32 s9, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_i32 s4, s7, s4 +; GFX9-NEXT: s_add_i32 s11, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s10, s8 +; GFX9-NEXT: s_cselect_b32 s5, s11, s5 +; GFX9-NEXT: s_abs_i32 s8, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX9-NEXT: v_readfirstlane_b32 s10, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_xor_b32 s5, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: s_xor_b32 s4, s10, s9 +; GFX9-NEXT: s_sub_i32 s9, 0, s8 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_abs_i32 s6, s10 -; GFX9-NEXT: s_xor_b32 s5, s10, s5 -; GFX9-NEXT: s_ashr_i32 s5, s5, 31 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_readfirstlane_b32 s7, v2 -; GFX9-NEXT: s_mul_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s8, s7, s8 -; GFX9-NEXT: s_add_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_i32 s9, s9, s7 +; GFX9-NEXT: s_mul_hi_u32 s9, s7, s9 +; GFX9-NEXT: s_add_i32 s7, s7, s9 ; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 -; GFX9-NEXT: s_mul_i32 s8, s7, s9 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s8 +; GFX9-NEXT: s_sub_i32 s6, s6, s9 ; GFX9-NEXT: s_add_i32 s10, s7, 1 -; GFX9-NEXT: s_sub_i32 s8, s6, s9 -; GFX9-NEXT: s_cmp_ge_u32 s6, s9 +; GFX9-NEXT: s_sub_i32 s9, s6, s8 +; GFX9-NEXT: s_cmp_ge_u32 s6, s8 ; GFX9-NEXT: s_cselect_b32 s7, s10, s7 -; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_add_i32 s8, s7, 1 -; GFX9-NEXT: s_cmp_ge_u32 s6, s9 -; GFX9-NEXT: s_cselect_b32 s6, s8, s7 -; GFX9-NEXT: s_xor_b32 s6, s6, s5 -; GFX9-NEXT: s_sub_i32 s5, s6, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s6, s9, s7 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_sub_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1371,17 +1371,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v4i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1399,7 +1399,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v4i32_4: @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; TONGA-LABEL: v_sdiv_i8: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -1515,18 +1515,18 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 ; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i8: @@ -1594,7 +1594,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1637,7 +1637,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1680,20 +1680,20 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 -; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 -; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 -; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2 +; GFX9-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:6 +; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_load_ushort v3, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1783,7 +1783,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1865,20 +1865,20 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 -; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 -; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 -; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:6 +; GFX9-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 offset:2 +; GFX9-NEXT: buffer_load_ushort v3, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1962,7 +1962,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i25: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 @@ -2009,7 +2009,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i25: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -2056,48 +2056,48 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 -; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: s_mov_b32 s6, s2 +; GFX9-NEXT: s_mov_b32 s7, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s4, s10 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x190000 -; GFX9-NEXT: s_abs_i32 s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x190000 +; GFX9-NEXT: s_abs_i32 s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mov_b32 s0, s8 +; GFX9-NEXT: s_sub_i32 s7, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s5, s4, s6 -; GFX9-NEXT: s_sub_i32 s6, 0, s7 -; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x190000 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 +; GFX9-NEXT: s_abs_i32 s6, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s5, s5, 31 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s8 -; GFX9-NEXT: s_mul_hi_u32 s6, s8, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s8 -; GFX9-NEXT: s_mul_i32 s8, s6, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_add_i32 s9, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s8, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 +; GFX9-NEXT: s_mul_i32 s8, s7, s5 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_add_i32 s9, s7, 1 +; GFX9-NEXT: s_sub_i32 s8, s6, s5 +; GFX9-NEXT: s_cmp_ge_u32 s6, s5 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s5 +; GFX9-NEXT: s_cselect_b32 s5, s8, s7 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2189,7 +2189,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; GCN-LABEL: scalarize_mulhs_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2221,7 +2221,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; TONGA-LABEL: scalarize_mulhs_4xi32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) @@ -2253,21 +2253,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; GFX9-LABEL: scalarize_mulhs_4xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s4, 0x1389c755 -; GFX9-NEXT: s_mov_b32 s0, s6 -; GFX9-NEXT: s_mov_b32 s1, s7 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4 -; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4 -; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4 -; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4 +; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0 +; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0 +; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0 +; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2280,7 +2280,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: scalarize_mulhs_4xi32: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index f4776747f16ac1..3e8768c98b5c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -16,26 +16,27 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_sub_u32 s4, 0, s10 ; GCN-NEXT: s_subb_u32 s5, 0, s11 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s12, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s12 +; GCN-NEXT: s_mov_b32 s13, s12 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_add_u32 s2, s2, s12 -; GCN-NEXT: s_mov_b32 s13, s12 +; GCN-NEXT: s_addc_u32 s3, s3, s12 +; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: s_addc_u32 s3, s3, s12 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -57,7 +58,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 -; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 @@ -140,23 +140,23 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_sdiv: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_ashr_i32 s2, s9, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 -; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 +; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-IR-NEXT: s_mov_b32 s5, s4 +; GCN-IR-NEXT: s_ashr_i32 s6, s9, 31 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_sub_u32 s12, s2, s4 +; GCN-IR-NEXT: s_subb_u32 s13, s3, s4 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[8:9], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s2, s2, s6 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s6 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9] ; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13] ; GCN-IR-NEXT: s_sub_u32 s16, s14, s20 @@ -181,8 +181,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s18 -; GCN-IR-NEXT: s_add_u32 s18, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 +; GCN-IR-NEXT: s_add_u32 s18, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s19, s3, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s12, s8, s20 ; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 @@ -200,7 +200,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 @@ -210,18 +210,18 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[10:11], 1 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 +; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = sdiv i64 %x, %y store i64 %result, ptr addrspace(1) %out @@ -460,62 +460,62 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s4, s4, s8 -; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s6, s4, 1 +; GCN-NEXT: s_xor_b32 s0, s0, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s2, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 -; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s6, s4, 1 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 @@ -587,8 +587,8 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -628,8 +628,8 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -676,14 +676,14 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -720,14 +720,14 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -771,62 +771,62 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s4, s4, s8 -; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s6, s4, 1 +; GCN-NEXT: s_xor_b32 s0, s0, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s2, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 -; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s6, s4, 1 +; GCN-IR-NEXT: s_xor_b32 s0, s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 @@ -838,14 +838,14 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -882,14 +882,14 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -933,94 +933,94 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_sdiv24_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_ashr_i64 s[6:7], s[12:13], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], 40 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s1, s8, s2 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_xor_b32 s5, s8, s6 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 ; GCN-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v2 +; GCN-NEXT: s_or_b32 s5, s5, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s5, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s10 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GCN-NEXT: s_xor_b32 s4, s4, s10 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[12:13], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[10:11], 40 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s1, s8, s2 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_xor_b32 s5, s8, s6 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 ; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s1, v2 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s5, v2 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s10 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s4, s10 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr <2 x i64> %x, %2 = ashr <2 x i64> %y, @@ -1032,18 +1032,20 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_sdiv24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s2, s7 -; GCN-NEXT: s_sext_i32_i16 s1, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_sext_i32_i16 s1, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_alignbit_b32 v2, s2, v2, 24 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_sext_i32_i16 s0, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_alignbit_b32 v2, s0, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 @@ -1055,38 +1057,36 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GCN-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 -; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 +; GCN-IR-NEXT: s_sext_i32_i16 s1, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24 +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 ; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 16 ; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 -; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 16 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GCN-IR-NEXT: s_mov_b32 s5, s4 +; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 ; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7] @@ -1146,19 +1146,19 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 -; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4 +; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-IR-NEXT: s_waitcnt expcnt(0) ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_sdiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 @@ -1853,7 +1853,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1939,7 +1939,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 5a1cc72644d47d..04a824a073a7eb 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: add_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: add_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -36,24 +36,24 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: add_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in, align 4 %shr = lshr i32 %a, 16 @@ -65,7 +65,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: sub_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: sub_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -94,24 +94,24 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: sub_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in, align 4 %shr = lshr i32 %a, 16 @@ -123,20 +123,20 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; NOSDWA-LABEL: mul_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_dword v4, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -147,20 +147,20 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: mul_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_u32_u24_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX89-NEXT: flat_store_dword v[0:1], v2 @@ -168,32 +168,32 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: mul_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_shr_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr i32, ptr addrspace(1) %in1, i32 %idx @@ -210,20 +210,20 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 ; NOSDWA-NEXT: flat_store_short v[0:1], v2 @@ -231,20 +231,20 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_ushort v4, v[0:1] ; GFX89-NEXT: flat_load_ushort v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v2, v4, v2 ; GFX89-NEXT: flat_store_short v[0:1], v2 @@ -252,32 +252,32 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -293,20 +293,20 @@ entry: define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_dword v4, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v4, v2 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -319,20 +319,20 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v4, v2 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -342,32 +342,32 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -383,20 +383,20 @@ entry: define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v5, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v4, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v6, v1, v3 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -415,20 +415,20 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v4i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v4, s4 -; GFX89-NEXT: v_mov_b32_e32 v5, s5 +; GFX89-NEXT: v_mov_b32_e32 v4, s0 +; GFX89-NEXT: v_mov_b32_e32 v5, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v6, v1, v3 ; GFX89-NEXT: v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -441,34 +441,34 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v4i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v4i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -484,20 +484,20 @@ entry: define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; NOSDWA-NEXT: v_mov_b32_e32 v8, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v9, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v8, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v9, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v10, v3, v7 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -528,20 +528,20 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v8i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v4, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v4, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX89-NEXT: v_mov_b32_e32 v8, s4 -; GFX89-NEXT: v_mov_b32_e32 v9, s5 +; GFX89-NEXT: v_mov_b32_e32 v8, s0 +; GFX89-NEXT: v_mov_b32_e32 v9, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v10, v3, v7 ; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -560,38 +560,38 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v8i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v8i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v3, v3, v7 ; GFX10-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -607,17 +607,17 @@ entry: define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_f16_e32 v2, v4, v2 ; NOSDWA-NEXT: flat_store_short v[0:1], v2 @@ -625,17 +625,17 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 ; GFX89-NEXT: flat_load_ushort v4, v[0:1] ; GFX89-NEXT: flat_load_ushort v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_f16_e32 v2, v4, v2 ; GFX89-NEXT: flat_store_short v[0:1], v2 @@ -643,30 +643,30 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %a = load half, ptr addrspace(1) %ina, align 4 @@ -679,17 +679,17 @@ entry: define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -703,17 +703,17 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX89-NEXT: v_mul_f16_e32 v2, v4, v2 @@ -723,30 +723,30 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %a = load <2 x half>, ptr addrspace(1) %ina, align 4 @@ -759,17 +759,17 @@ entry: define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v5, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v4, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -789,17 +789,17 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v4half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v4, s4 -; GFX89-NEXT: v_mov_b32_e32 v5, s5 +; GFX89-NEXT: v_mov_b32_e32 v4, s0 +; GFX89-NEXT: v_mov_b32_e32 v5, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX89-NEXT: v_mul_f16_e32 v1, v1, v3 @@ -812,32 +812,32 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v4half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v4half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm entry: %a = load <4 x half>, ptr addrspace(1) %ina, align 4 @@ -850,17 +850,17 @@ entry: define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v4, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 -; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v4, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s3 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; NOSDWA-NEXT: v_mov_b32_e32 v8, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v9, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v8, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v9, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -892,17 +892,17 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v8half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_mov_b32_e32 v4, s0 -; GFX89-NEXT: v_mov_b32_e32 v5, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_mov_b32_e32 v4, s4 +; GFX89-NEXT: v_mov_b32_e32 v5, s5 ; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX89-NEXT: v_mov_b32_e32 v8, s4 -; GFX89-NEXT: v_mov_b32_e32 v9, s5 +; GFX89-NEXT: v_mov_b32_e32 v8, s0 +; GFX89-NEXT: v_mov_b32_e32 v9, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_f16_sdwa v10, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX89-NEXT: v_mul_f16_e32 v3, v3, v7 @@ -921,36 +921,36 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v8half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v8half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm entry: %a = load <8 x half>, ptr addrspace(1) %ina, align 4 @@ -963,19 +963,19 @@ entry: define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v2, s7 -; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s3 +; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s2, v0 ; NOSDWA-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v4, s1 -; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; NOSDWA-NEXT: v_mov_b32_e32 v4, s5 +; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s4, v0 ; NOSDWA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; NOSDWA-NEXT: flat_load_ubyte v2, v[1:2] ; NOSDWA-NEXT: flat_load_ubyte v3, v[3:4] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; NOSDWA-NEXT: flat_store_byte v[0:1], v2 @@ -983,19 +983,19 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0 +; GFX89-NEXT: v_mov_b32_e32 v2, s3 +; GFX89-NEXT: v_add_u32_e32 v1, vcc, s2, v0 ; GFX89-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX89-NEXT: v_mov_b32_e32 v4, s1 -; GFX89-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; GFX89-NEXT: v_mov_b32_e32 v4, s5 +; GFX89-NEXT: v_add_u32_e32 v3, vcc, s4, v0 ; GFX89-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX89-NEXT: flat_load_ubyte v2, v[1:2] ; GFX89-NEXT: flat_load_ubyte v3, v[3:4] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; GFX89-NEXT: flat_store_byte v[0:1], v2 @@ -1003,30 +1003,30 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] +; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 -; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1042,20 +1042,20 @@ entry: define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b16_e32 v3, 8, v4 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -1070,20 +1070,20 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v2i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_ushort v4, v[0:1] ; GFX89-NEXT: flat_load_ushort v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v4, v2 ; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 @@ -1093,30 +1093,30 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v2i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v2i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1126,7 +1126,7 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: global_store_short v2, v0, s[4:5] +; GFX10-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1142,20 +1142,20 @@ entry: define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 ; NOSDWA-NEXT: flat_load_dword v4, v[0:1] -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_dword v2, v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 24, v4 @@ -1182,20 +1182,20 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v4i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v3, v4, v2 ; GFX89-NEXT: v_mul_lo_u16_sdwa v5, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 @@ -1209,12 +1209,12 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v4i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 @@ -1224,19 +1224,19 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v4i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1 @@ -1255,7 +1255,7 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1271,20 +1271,20 @@ entry: define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v5, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v4, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 24, v0 @@ -1330,20 +1330,20 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v8i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v4, s4 -; GFX89-NEXT: v_mov_b32_e32 v5, s5 +; GFX89-NEXT: v_mov_b32_e32 v4, s0 +; GFX89-NEXT: v_mov_b32_e32 v5, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_e32 v6, v1, v3 ; GFX89-NEXT: v_mul_lo_u16_sdwa v7, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 @@ -1364,12 +1364,12 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v8i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v1, v3 @@ -1386,19 +1386,19 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul_v8i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b16 v6, 8, v0 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1449,7 +1449,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; NOSDWA-LABEL: sitofp_v2i16_to_v2f16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1467,7 +1467,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX89-LABEL: sitofp_v2i16_to_v2f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -1483,29 +1483,29 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX9-LABEL: sitofp_v2i16_to_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f16_i16_e32 v2, v1 ; GFX9-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sitofp_v2i16_to_v2f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f16_i16_e32 v2, v1 ; GFX10-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { @@ -1519,17 +1519,17 @@ entry: define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mac_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -1543,17 +1543,17 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mac_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_mov_b32_e32 v1, s7 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: flat_load_dword v3, v[0:1] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(1) ; GFX89-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX89-NEXT: s_waitcnt vmcnt(0) @@ -1566,32 +1566,32 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mac_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mac_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %a = load <2 x half>, ptr addrspace(1) %ina, align 4 @@ -1605,7 +1605,7 @@ entry: define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: immediate_mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX89-LABEL: immediate_mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX89-NEXT: v_mov_b32_e32 v3, 0x141 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -1644,27 +1644,27 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: immediate_mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x141007b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x141007b ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: immediate_mul_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x141007b, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1679,20 +1679,20 @@ entry: define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mulmul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -1708,20 +1708,20 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX89-LABEL: mulmul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX89-NEXT: v_mov_b32_e32 v3, s1 -; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 +; GFX89-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX89-NEXT: v_mul_lo_u16_e32 v4, v4, v2 @@ -1733,34 +1733,34 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: mulmul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mulmul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1777,16 +1777,16 @@ entry: define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: add_bb_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 ; NOSDWA-NEXT: flat_load_dword v1, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -1796,56 +1796,56 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_or_b32_e32 v2, v1, v2 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 ; NOSDWA-NEXT: s_endpgm ; ; GFX89-LABEL: add_bb_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 ; GFX89-NEXT: flat_load_dword v1, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX89-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX89-NEXT: v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm ; ; GFX9-LABEL: add_bb_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_bb_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %a = load <2 x i16>, ptr addrspace(1) %ina, align 4 @@ -1863,7 +1863,7 @@ store_label: define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 { ; NOSDWA-LABEL: pulled_out_test: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 @@ -1894,7 +1894,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX89-LABEL: pulled_out_test: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s1 @@ -1925,58 +1925,58 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX9-LABEL: pulled_out_test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: s_lshr_b32 s3, s1, 24 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_and_b32 s2, s1, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX9-NEXT: s_and_b32 s4, s0, 0xffff +; GFX9-NEXT: s_and_b32 s6, s0, 0xffff ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_or_b32 s1, s1, s3 -; GFX9-NEXT: s_or_b32 s0, s0, s5 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s5 +; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s1, s2, s1 -; GFX9-NEXT: s_or_b32 s0, s4, s0 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: pulled_out_test: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: s_lshr_b32 s3, s0, 24 -; GFX10-NEXT: s_lshr_b32 s5, s1, 24 -; GFX10-NEXT: s_and_b32 s2, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshr_b32 s7, s1, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xffff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_and_b32 s4, s1, 0xffff +; GFX10-NEXT: s_and_b32 s6, s1, 0xffff ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_or_b32 s0, s0, s3 -; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_or_b32 s1, s1, s7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_or_b32 s1, s4, s1 +; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_or_b32 s1, s6, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm entry: %idxprom = ashr exact i64 15, 32 @@ -2200,17 +2200,17 @@ bb2: define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mac_v2half_same_srcop: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) -; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 -; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) @@ -2224,17 +2224,17 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX89-LABEL: mac_v2half_same_srcop: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v1, s7 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_mov_b32_e32 v2, s4 +; GFX89-NEXT: v_mov_b32_e32 v3, s5 ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; GFX89-NEXT: s_waitcnt vmcnt(1) ; GFX89-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; GFX89-NEXT: s_waitcnt vmcnt(0) @@ -2247,34 +2247,34 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: mac_v2half_same_srcop: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mac_v2half_same_srcop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm entry: %a = load <2 x half>, ptr addrspace(1) %ina, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll index f11e86aef683d1..5eb3ae8d9a8fd3 100644 --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GCN-LABEL: select_constant_cttz: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 84ca5dd0c18633..572026da79646c 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -6,35 +6,35 @@ define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s15 -; SI-NEXT: s_mov_b32 s20, s8 -; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s14 -; SI-NEXT: s_mov_b32 s23, s15 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s16, s10 +; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s20, s12 +; SI-NEXT: s_mov_b32 s21, s13 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -42,67 +42,67 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s6 -; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s19, s15 -; VI-NEXT: s_mov_b32 s20, s8 -; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s14 -; VI-NEXT: s_mov_b32 s23, s15 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: s_mov_b32 s11, s15 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s16, s10 +; VI-NEXT: s_mov_b32 s17, s11 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s20, s12 +; VI-NEXT: s_mov_b32 s21, s13 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s12, s14 +; VI-NEXT: s_mov_b32 s13, s15 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s0, s8 +; VI-NEXT: s_mov_b32 s1, s9 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[12:15], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-NEXT: s_mov_b32 s14, -1 -; GFX11-NEXT: s_mov_b32 s15, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s14 -; GFX11-NEXT: s_mov_b32 s19, s15 -; GFX11-NEXT: s_mov_b32 s22, s14 -; GFX11-NEXT: s_mov_b32 s23, s15 -; GFX11-NEXT: s_mov_b32 s26, s14 -; GFX11-NEXT: s_mov_b32 s27, s15 -; GFX11-NEXT: s_mov_b32 s2, s14 -; GFX11-NEXT: s_mov_b32 s3, s15 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s6 +; GFX11-NEXT: s_mov_b32 s19, s7 +; GFX11-NEXT: s_mov_b32 s22, s6 +; GFX11-NEXT: s_mov_b32 s23, s7 +; GFX11-NEXT: s_mov_b32 s26, s6 +; GFX11-NEXT: s_mov_b32 s27, s7 +; GFX11-NEXT: s_mov_b32 s2, s6 +; GFX11-NEXT: s_mov_b32 s3, s7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s6 -; GFX11-NEXT: s_mov_b32 s17, s7 -; GFX11-NEXT: s_mov_b32 s20, s8 -; GFX11-NEXT: s_mov_b32 s21, s9 -; GFX11-NEXT: s_mov_b32 s24, s10 -; GFX11-NEXT: s_mov_b32 s25, s11 +; GFX11-NEXT: s_mov_b32 s16, s10 +; GFX11-NEXT: s_mov_b32 s17, s11 +; GFX11-NEXT: s_mov_b32 s20, s12 +; GFX11-NEXT: s_mov_b32 s21, s13 +; GFX11-NEXT: s_mov_b32 s24, s14 +; GFX11-NEXT: s_mov_b32 s25, s15 ; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc @@ -111,11 +111,11 @@ define amdgpu_kernel void @select_f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s4 -; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s4, s8 +; GFX11-NEXT: s_mov_b32 s5, s9 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -137,7 +137,7 @@ entry: define amdgpu_kernel void @select_f16_imm_a( ; SI-LABEL: select_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -172,7 +172,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; VI-LABEL: select_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -203,7 +203,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; GFX11-LABEL: select_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -248,7 +248,7 @@ entry: define amdgpu_kernel void @select_f16_imm_b( ; SI-LABEL: select_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -283,7 +283,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; VI-LABEL: select_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -314,7 +314,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; GFX11-LABEL: select_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -359,7 +359,7 @@ entry: define amdgpu_kernel void @select_f16_imm_c( ; SI-LABEL: select_f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -394,7 +394,7 @@ define amdgpu_kernel void @select_f16_imm_c( ; ; VI-LABEL: select_f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -426,7 +426,7 @@ define amdgpu_kernel void @select_f16_imm_c( ; ; GFX11-LABEL: select_f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -471,7 +471,7 @@ entry: define amdgpu_kernel void @select_f16_imm_d( ; SI-LABEL: select_f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -506,7 +506,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; VI-LABEL: select_f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -538,7 +538,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; GFX11-LABEL: select_f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -583,31 +583,31 @@ entry: define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s15 -; SI-NEXT: s_mov_b32 s20, s8 -; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s14 -; SI-NEXT: s_mov_b32 s23, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s16, s10 +; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s20, s12 +; SI-NEXT: s_mov_b32 s21, s13 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 -; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -632,36 +632,36 @@ define amdgpu_kernel void @select_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s6 -; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s18, s14 -; VI-NEXT: s_mov_b32 s19, s15 -; VI-NEXT: s_mov_b32 s20, s8 -; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s14 -; VI-NEXT: s_mov_b32 s23, s15 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s16, s10 +; VI-NEXT: s_mov_b32 s17, s11 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s20, s12 +; VI-NEXT: s_mov_b32 s21, s13 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s12, s14 +; VI-NEXT: s_mov_b32 s13, s15 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 -; VI-NEXT: s_mov_b32 s11, s15 -; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s8 +; VI-NEXT: s_mov_b32 s1, s9 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -676,18 +676,18 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[12:13], s[2:3], 0x44 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s2 -; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s6, s2 +; GFX11-NEXT: s_mov_b32 s7, s3 ; GFX11-NEXT: s_mov_b32 s22, s2 ; GFX11-NEXT: s_mov_b32 s23, s3 ; GFX11-NEXT: s_mov_b32 s18, s2 @@ -695,18 +695,18 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_mov_b32 s26, s2 ; GFX11-NEXT: s_mov_b32 s27, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s20, s8 -; GFX11-NEXT: s_mov_b32 s21, s9 -; GFX11-NEXT: s_mov_b32 s16, s6 -; GFX11-NEXT: s_mov_b32 s17, s7 -; GFX11-NEXT: s_mov_b32 s24, s10 -; GFX11-NEXT: s_mov_b32 s25, s11 -; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s20, s12 +; GFX11-NEXT: s_mov_b32 s21, s13 +; GFX11-NEXT: s_mov_b32 s16, s10 +; GFX11-NEXT: s_mov_b32 s17, s11 +; GFX11-NEXT: s_mov_b32 s24, s14 +; GFX11-NEXT: s_mov_b32 s25, s15 +; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 -; GFX11-NEXT: s_mov_b32 s0, s4 -; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s0, s8 +; GFX11-NEXT: s_mov_b32 s1, s9 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(2) @@ -742,7 +742,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_a( ; SI-LABEL: select_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -789,7 +789,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; VI-LABEL: select_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -827,7 +827,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; GFX11-LABEL: select_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -881,7 +881,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_b( ; SI-LABEL: select_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -928,7 +928,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; VI-LABEL: select_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -966,7 +966,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; GFX11-LABEL: select_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1020,7 +1020,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_c( ; SI-LABEL: select_v2f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1067,7 +1067,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; VI-LABEL: select_v2f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; GFX11-LABEL: select_v2f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 @@ -1161,7 +1161,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_d( ; SI-LABEL: select_v2f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; VI-LABEL: select_v2f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; GFX11-LABEL: select_v2f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll index 31a802b7428b95..8f944269869df1 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,15 +25,15 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NEXT: s_add_u32 s4, s6, s4 -; GCN-NEXT: s_addc_u32 s5, s7, s5 +; GCN-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x100000 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s5, s9, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -47,12 +47,12 @@ define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -68,12 +68,12 @@ define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -89,15 +89,15 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s7, s6, 31 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_ashr_i32 s5, s8, 31 +; GCN-NEXT: s_add_u32 s4, s6, s8 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -111,12 +111,12 @@ define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @sext_i32_to_i64_divergent(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 0630cca7c099b8..be31078d86860e 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,27 +11,27 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s7, s7, s0 +; SI-NEXT: s_add_i32 s3, s3, s6 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if -; SI-NEXT: s_sub_i32 s7, s5, s6 +; SI-NEXT: s_sub_i32 s3, s1, s2 ; SI-NEXT: .LBB0_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_add_i32 s4, s7, s4 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_add_i32 s0, s3, s0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB0_4: -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr3 ; SI-NEXT: s_branch .LBB0_2 entry: @@ -56,32 +56,32 @@ endif: define amdgpu_kernel void @sgpr_if_else_salu_br_opt(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s2, s[4:5], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s0, s[2:3], 0x2e -; SI-NEXT: s_load_dword s1, s[2:3], 0x37 +; SI-NEXT: s_load_dword s0, s[4:5], 0x2e +; SI-NEXT: s_load_dword s1, s[4:5], 0x37 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s0, s1 +; SI-NEXT: s_add_i32 s3, s0, s1 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %if -; SI-NEXT: s_load_dword s0, s[2:3], 0x1c -; SI-NEXT: s_load_dword s1, s[2:3], 0x25 +; SI-NEXT: s_load_dword s0, s[4:5], 0x1c +; SI-NEXT: s_load_dword s1, s[4:5], 0x25 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s0, s1 +; SI-NEXT: s_add_i32 s3, s0, s1 ; SI-NEXT: .LBB1_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_add_i32 s4, s5, s4 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_add_i32 s0, s3, s2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB1_4: -; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr3 ; SI-NEXT: s_branch .LBB1_2 entry: @@ -108,28 +108,27 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_br(ptr addrspace(1) %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xc +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SI-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; SI-NEXT: s_cbranch_execz .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s8, s6, s7 +; SI-NEXT: s_add_i32 s8, s2, s3 ; SI-NEXT: .LBB2_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[2:3], s[6:7] ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_xor_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execz .LBB2_4 +; SI-NEXT: s_xor_b64 exec, exec, s[2:3] ; SI-NEXT: ; %bb.3: ; %if -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: .LBB2_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_add_i32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: ; %bb.4: ; %endif +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,9 +157,9 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 @@ -168,22 +167,22 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, p ; SI-NEXT: s_xor_b64 s[10:11], exec, s[10:11] ; SI-NEXT: s_cbranch_execz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB3_2: ; %Flow ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[10:11] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; SI-NEXT: s_cbranch_execz .LBB3_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], exec @@ -192,11 +191,11 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, p ; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: s_or_b64 s[8:9], s[2:3], s[6:7] ; SI-NEXT: .LBB3_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll index 3d8807a88a46c1..664eb0c037cac9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3 target triple = "amdgcn-amd-amdhsa" ; CHECK-LABEL: {{^}}t0: -; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[6:7], 0x0 +; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[8:9], 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] ; There should be no redundant copies from PTR_HI. ; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index 59036c64c8afcc..455d22f2aa29cd 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s0, s13 +; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll index b339915edd2061..47810346c50b7d 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll @@ -12,20 +12,26 @@ define amdgpu_kernel void @kernel() { ; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s38, -1 +; GCN-NEXT: ; implicit-def: $vgpr40 : SGPR spill to VGPR lane ; GCN-NEXT: s_mov_b32 s39, 0xe00000 -; GCN-NEXT: s_add_u32 s36, s36, s9 +; GCN-NEXT: v_writelane_b32 v40, s4, 0 +; GCN-NEXT: s_add_u32 s36, s36, s11 +; GCN-NEXT: v_writelane_b32 v40, s5, 1 ; GCN-NEXT: s_addc_u32 s37, s37, 0 -; GCN-NEXT: s_mov_b32 s14, s8 -; GCN-NEXT: s_add_u32 s8, s2, 36 -; GCN-NEXT: s_addc_u32 s9, s3, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_readlane_b32 s0, v40, 0 +; GCN-NEXT: s_mov_b32 s13, s9 +; GCN-NEXT: s_mov_b32 s12, s8 +; GCN-NEXT: v_readlane_b32 s1, v40, 1 +; GCN-NEXT: s_add_u32 s8, s0, 36 +; GCN-NEXT: s_addc_u32 s9, s1, 0 ; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 -; GCN-NEXT: s_mov_b32 s13, s7 -; GCN-NEXT: s_mov_b32 s12, s6 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s14, s10 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] @@ -33,7 +39,7 @@ define amdgpu_kernel void @kernel() { ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_endpgm call void @foo() ret void diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index ebc916b5c889b5..fc6ad39db5b89f 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -181,7 +181,7 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -247,7 +247,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,7 +430,7 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ -502,7 +502,7 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 47ab5ba666877a..6b4bca11d80c78 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workgroup.id.x() #0 define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -70,7 +70,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -140,7 +140,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -159,7 +159,7 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -214,40 +214,40 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s12, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s12, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_s: @@ -287,42 +287,42 @@ define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_compute_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_compute_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s12, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s12, s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_compute_s: @@ -370,7 +370,7 @@ define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16_computed_amount: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -396,7 +396,7 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: shl_i16_computed_amount: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -472,24 +472,24 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { ; SI-LABEL: shl_i16_i_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 12 +; SI-NEXT: s_lshl_b32 s4, s6, 12 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_i_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 12 +; VI-NEXT: s_lshl_b32 s4, s6, 12 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -530,7 +530,7 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -561,7 +561,7 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -659,7 +659,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -752,7 +752,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -770,7 +770,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -819,7 +819,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -839,7 +839,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -903,7 +903,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -929,7 +929,7 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -1029,25 +1029,25 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -1070,27 +1070,27 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_ashr_i32 s7, s6, 31 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_ashr_i32 s9, s8, 31 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: s_lshl_b64 s[2:3], s[8:9], 3 ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_ashr_i32 s7, s6, 31 -; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_ashr_i32 s9, s8, 31 +; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, s4 @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: s_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s9, 0xffff ; SI-NEXT: s_mov_b32 s8, s6 @@ -1153,7 +1153,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: s_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s9, 0xffff ; VI-NEXT: s_mov_b32 s8, s6 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_i64_32_bit_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_shl_i64_32_bit_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1331,7 +1331,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1394,12 +1394,12 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], 64, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 64, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1407,12 +1407,12 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: s_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], 64, s4 +; VI-NEXT: s_lshl_b64 s[4:5], 64, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1444,12 +1444,12 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], 1, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 1, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1457,12 +1457,12 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: s_shl_inline_imm_1_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], 1, s4 +; VI-NEXT: s_lshl_b64 s[4:5], 1, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1495,12 +1495,12 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], 1.0, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 1.0, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1508,12 +1508,12 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s4 +; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1542,12 +1542,12 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], -1.0, s4 +; SI-NEXT: s_lshl_b64 s[4:5], -1.0, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1555,12 +1555,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s4 +; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1589,12 +1589,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], 0.5, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 0.5, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1602,12 +1602,12 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s4 +; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1636,12 +1636,12 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], -0.5, s4 +; SI-NEXT: s_lshl_b64 s[4:5], -0.5, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1649,12 +1649,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s4 +; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1683,12 +1683,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], 2.0, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 2.0, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1696,12 +1696,12 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s4 +; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1730,12 +1730,12 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], -2.0, s4 +; SI-NEXT: s_lshl_b64 s[4:5], -2.0, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1743,12 +1743,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s4 +; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1777,12 +1777,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], 4.0, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 4.0, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1790,12 +1790,12 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s4 +; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1824,12 +1824,12 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], -4.0, s4 +; SI-NEXT: s_lshl_b64 s[4:5], -4.0, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1837,12 +1837,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s4 +; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1874,12 +1874,12 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1887,12 +1887,12 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s4 +; VI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1926,8 +1926,8 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s4, -4.0 ; SI-NEXT: s_mov_b32 s5, -1 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1941,8 +1941,8 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou ; ; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s4, -4.0 ; VI-NEXT: s_mov_b32 s5, -1 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1982,8 +1982,8 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s5, 4.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1997,8 +1997,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %o ; ; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: s_mov_b32 s5, 4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -2033,8 +2033,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %o define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s5, -4.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2048,8 +2048,8 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1 ; ; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s4, 0 ; VI-NEXT: s_mov_b32 s5, -4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1 define amdgpu_kernel void @test_mul2(i32 %p) { ; SI-LABEL: test_mul2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2096,7 +2096,7 @@ define amdgpu_kernel void @test_mul2(i32 %p) { ; ; VI-LABEL: test_mul2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 52dbd31b2c646c..4b616e836f9169 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,20 +8,20 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s7, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -59,19 +59,19 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s7, s6 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -88,18 +88,18 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -118,7 +118,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -140,18 +140,18 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0 -; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -176,33 +176,33 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3 +; VI-NEXT: v_lshlrev_b16_e32 v4, s4, v3 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -210,53 +210,51 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dword s8, s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_lshr_b32 s4, s8, 16 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s4, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: shl_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v1, s0, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, s4, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_lshlrev_b16 v1, s0, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_pk_lshlrev_b16 v1, s4, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -271,33 +269,33 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s6 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0 +; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s4 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -305,53 +303,51 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dword s8, s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_lshr_b32 s4, s8, 16 +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshl_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshl_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s4, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: shl_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -366,18 +362,18 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -397,7 +393,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -418,18 +414,18 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -452,18 +448,18 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -483,7 +479,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -500,18 +496,18 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -534,19 +530,19 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -568,7 +564,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -597,19 +593,19 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_shl_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -635,19 +631,19 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -671,7 +667,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -694,19 +690,19 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: shl_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 6ac04d8bc42bba..10fdaaa17da0a4 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -84,29 +84,29 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -129,7 +129,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -173,7 +173,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -196,7 +196,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -221,41 +221,41 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 ; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -287,7 +287,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_64_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_64_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -319,7 +319,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_64_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -336,7 +336,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_64_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -355,29 +355,29 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_64_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_64_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_64_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_65: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -416,7 +416,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_65: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -432,7 +432,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_65: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -449,7 +449,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_65: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -468,51 +468,51 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0x41, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -525,7 +525,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -548,7 +548,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_65_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -564,7 +564,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_65_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -580,7 +580,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_65_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -597,7 +597,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_65_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -616,29 +616,29 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_65_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_65_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_65_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -661,7 +661,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -677,7 +677,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -693,7 +693,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -710,7 +710,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -729,51 +729,51 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, -16, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -786,7 +786,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -809,7 +809,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -825,7 +825,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -841,7 +841,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -858,7 +858,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -877,29 +877,29 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg16_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_neg16_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -922,7 +922,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -938,7 +938,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -954,7 +954,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -971,7 +971,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -990,51 +990,51 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 17, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0xffffffef, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1138,29 +1138,29 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg17_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i32_neg17_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; SI-LABEL: s_test_i32_x_sub_64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s0, s0, 64 ; SI-NEXT: ;;#ASMSTART @@ -1193,7 +1193,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; VI-LABEL: s_test_i32_x_sub_64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sub_i32 s0, s0, 64 ; VI-NEXT: ;;#ASMSTART @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX9-LABEL: s_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s0, s0, 64 ; GFX9-NEXT: ;;#ASMSTART @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX10-LABEL: s_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s0, s0, 64 ; GFX10-NEXT: ;;#ASMSTART @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX11-LABEL: s_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s0, s0, 64 ; GFX11-NEXT: ;;#ASMSTART @@ -1238,7 +1238,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1254,7 +1254,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1270,7 +1270,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1287,7 +1287,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1306,29 +1306,29 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i16_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i16_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1351,7 +1351,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1369,7 +1369,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1387,7 +1387,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1425,32 +1425,32 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v1, s[6:7] +; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1477,7 +1477,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1521,7 +1521,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1544,7 +1544,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1569,41 +1569,41 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 ; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v2, s[4:5] +; GFX9-NEXT: global_store_short v0, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 ; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64 -; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_short v0, v2, s[4:5] +; GFX10-NEXT: global_store_short v0, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1635,7 +1635,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1654,7 +1654,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1676,7 +1676,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,7 +1696,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1718,29 +1718,29 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1763,7 +1763,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1804,7 +1804,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1846,42 +1846,42 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x400007 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x400007 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1923,7 +1923,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1945,7 +1945,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1965,7 +1965,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1987,42 +1987,42 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x7b0040 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x7b0040 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2064,7 +2064,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2103,7 +2103,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2125,29 +2125,29 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2171,7 +2171,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2187,7 +2187,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2208,7 +2208,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2248,29 +2248,29 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2293,7 +2293,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2309,7 +2309,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2330,7 +2330,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2349,7 +2349,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2370,42 +2370,42 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_brev_b32 s0, 35 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_brev_b32 s2, 35 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2428,7 +2428,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2444,7 +2444,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2465,7 +2465,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2484,7 +2484,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2505,42 +2505,42 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_brev_b32 s0, 34 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_brev_b32 s2, 34 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x44000000 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2564,7 +2564,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2605,7 +2605,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 32 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -2625,7 +2625,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_not_b32_e32 v4, 31 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2647,29 +2647,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2692,7 +2692,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2708,7 +2708,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2729,7 +2729,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2748,7 +2748,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2769,29 +2769,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2814,7 +2814,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2832,7 +2832,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2852,7 +2852,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2871,7 +2871,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2893,29 +2893,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2939,7 +2939,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2980,7 +2980,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, -16 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3000,7 +3000,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, -16 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3022,29 +3022,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3083,7 +3083,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3104,7 +3104,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3123,7 +3123,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3144,29 +3144,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3189,7 +3189,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3207,7 +3207,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3227,7 +3227,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3246,7 +3246,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3268,29 +3268,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3313,7 +3313,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3332,7 +3332,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3354,7 +3354,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3374,7 +3374,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3396,53 +3396,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_movk_i32 s0, 0xc400 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_movk_i32 s2, 0xc400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3455,7 +3455,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3478,7 +3478,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3497,7 +3497,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3519,7 +3519,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3539,7 +3539,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3561,53 +3561,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_movk_i32 s0, 0x4400 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_movk_i32 s2, 0x4400 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0] -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1] -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3620,7 +3620,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3643,7 +3643,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3662,7 +3662,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3684,7 +3684,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3704,7 +3704,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3726,29 +3726,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3771,7 +3771,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3790,7 +3790,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3812,7 +3812,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3832,7 +3832,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3854,29 +3854,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3899,7 +3899,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3916,7 +3916,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3935,7 +3935,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3953,7 +3953,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3975,29 +3975,29 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4020,7 +4020,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4037,7 +4037,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -4054,7 +4054,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4071,7 +4071,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4093,52 +4093,52 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_not_b32_e32 v2, 31 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4151,7 +4151,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 9f3596359a6625..105f201e318468 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: break_inserted_outside_of_loop: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s0, v0 @@ -14,13 +14,13 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: .LBB0_1: ; %ENDIF ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 s[4:5], exec, vcc -; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; SI-NEXT: s_and_b64 s[2:3], exec, vcc +; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB0_1 ; SI-NEXT: ; %bb.2: ; %ENDLOOP ; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -30,7 +30,7 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; ; FLAT-LABEL: break_inserted_outside_of_loop: ; FLAT: ; %bb.0: ; %main_body -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x2c +; FLAT-NEXT: s_load_dword s0, s[4:5], 0x2c ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_and_b32_e32 v0, s0, v0 @@ -39,13 +39,13 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; FLAT-NEXT: s_mov_b64 s[0:1], 0 ; FLAT-NEXT: .LBB0_1: ; %ENDIF ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_and_b64 s[4:5], exec, vcc -; FLAT-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; FLAT-NEXT: s_and_b64 s[2:3], exec, vcc +; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] ; FLAT-NEXT: s_cbranch_execnz .LBB0_1 ; FLAT-NEXT: ; %bb.2: ; %ENDLOOP ; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: v_mov_b32_e32 v0, 0 @@ -72,21 +72,21 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 -; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SI-NEXT: s_cbranch_execz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; SI-NEXT: .LBB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB1_3: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; SI-NEXT: s_and_b64 s[4:5], exec, s[2:3] +; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: ; %bb.4: ; %exit @@ -97,21 +97,21 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; FLAT-NEXT: s_mov_b64 s[0:1], 0 -; FLAT-NEXT: s_mov_b64 s[4:5], 0 +; FLAT-NEXT: s_mov_b64 s[2:3], 0 ; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc ; FLAT-NEXT: s_cbranch_execz .LBB1_2 ; FLAT-NEXT: ; %bb.1: ; %else -; FLAT-NEXT: s_load_dword s2, s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s2, s[4:5], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmp_eq_u32 s2, 0 ; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0 -; FLAT-NEXT: s_and_b64 s[4:5], s[2:3], exec +; FLAT-NEXT: s_and_b64 s[2:3], s[2:3], exec ; FLAT-NEXT: .LBB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] ; FLAT-NEXT: .LBB1_3: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_and_b64 s[2:3], exec, s[4:5] -; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; FLAT-NEXT: s_and_b64 s[4:5], exec, s[2:3] +; FLAT-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] ; FLAT-NEXT: s_cbranch_execnz .LBB1_3 ; FLAT-NEXT: ; %bb.4: ; %exit @@ -166,12 +166,12 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { ; SI-LABEL: loop_land_info_assert: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xa +; SI-NEXT: s_load_dword s0, s[4:5], 0xa ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lt_i32 s0, 4 ; SI-NEXT: s_cbranch_scc1 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %for.cond.preheader -; SI-NEXT: s_load_dword s0, s[2:3], 0xc +; SI-NEXT: s_load_dword s0, s[4:5], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 @@ -186,12 +186,12 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; ; FLAT-LABEL: loop_land_info_assert: ; FLAT: ; %bb.0: ; %entry -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x28 +; FLAT-NEXT: s_load_dword s0, s[4:5], 0x28 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmp_lt_i32 s0, 4 ; FLAT-NEXT: s_cbranch_scc1 .LBB3_4 ; FLAT-NEXT: ; %bb.1: ; %for.cond.preheader -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x30 +; FLAT-NEXT: s_load_dword s0, s[4:5], 0x30 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; FLAT-NEXT: s_cbranch_scc0 .LBB3_4 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index e64dcb74267dd9..e5047cfa0b4e91 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 1d183210f95380..2dfb72a08cffc5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -8,43 +8,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; This used to bypass the structurization process because structurizer is unable to ; handle multiple-exits CFG. This should be correctly structurized. -; UNIFY-LABEL: define amdgpu_kernel void @kernel -; UNIFY-LABEL: entry: -; UNIFY: %tid = call i32 @llvm.amdgcn.workitem.id.x() -; UNIFY-NEXT: %cmp = icmp eq i32 %n.load, 256 -; UNIFY-NEXT: br i1 %cmp, label %if.then, label %if.else -; UNIFY-LABEL: if.then: -; UNIFY-NEXT: %cmp1 = icmp eq i32 %a.load, 0 -; UNIFY-NEXT: br i1 %cmp1, label %if.end6.sink.split, label %cond.false -; UNIFY-LABEL: cond.false: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: br label %UnifiedUnreachableBlock -; UNIFY-LABEL: if.else: -; UNIFY-NEXT: %cmp2 = icmp ult i32 %tid, 10 -; UNIFY-NEXT: br i1 %cmp2, label %if.then3, label %UnifiedReturnBlock -; UNIFY-LABEL: if.then3: -; UNIFY-NEXT: %cmp1.i7 = icmp eq i32 %a.load, 0 -; UNIFY-NEXT: br i1 %cmp1.i7, label %if.end6.sink.split, label %cond.false.i8 -; UNIFY-LABEL: cond.false.i8: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: br label %UnifiedUnreachableBlock -; UNIFY-LABEL: if.end6.sink.split: -; UNIFY-NEXT: %x.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %kernel.kernarg.segment, i64 8 -; UNIFY-NEXT: %x.load = load ptr addrspace(1), ptr addrspace(4) %x.kernarg.offset, align 8, !invariant.load !0 -; UNIFY-NEXT: %idxprom = sext i32 %tid to i64 -; UNIFY-NEXT: %x1 = getelementptr inbounds i32, ptr addrspace(1) %x.load, i64 %idxprom -; UNIFY-NEXT: store i32 %a.load, ptr addrspace(1) %x1, align 4 -; UNIFY-NEXT: br label %UnifiedReturnBlock -; UNIFY-LABEL: UnifiedUnreachableBlock: -; UNIFY-NEXT: call void @llvm.amdgcn.unreachable() -; UNIFY-NEXT: br label %UnifiedReturnBlock -; UNIFY-LABEL: UnifiedReturnBlock: -; UNIFY-NEXT: ret void - ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x10 -; CHECK-NEXT: s_load_dword s10, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s10, s[8:9], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmpk_lg_i32 s0, 0x100 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 @@ -53,7 +20,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: s_cbranch_execz .LBB0_5 ; CHECK-NEXT: ; %bb.2: ; %if.then3 ; CHECK-NEXT: s_cmp_lg_u32 s10, 0 @@ -64,7 +31,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: .LBB0_5: ; %Flow2 -; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: s_branch .LBB0_7 @@ -85,7 +52,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; %if.end6.sink.split -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, s10 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -104,6 +71,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 + entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -137,3 +105,5 @@ if.end6.sink.split: if.end6: ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; UNIFY: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 5f1e3bd9a9fe1a..af78768520d23f 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,32 +39,32 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s6 -; SI-NEXT: s_ashr_i32 s5, s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mul_i32 s0, s0, s1 +; SI-NEXT: s_add_i32 s0, s0, s2 +; SI-NEXT: s_ashr_i32 s1, s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_add_i32 s4, s4, s6 -; VI-NEXT: s_ashr_i32 s5, s4, 31 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: s_add_i32 s0, s0, s2 +; VI-NEXT: s_ashr_i32 s1, s0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm entry: %mul = mul i32 %a, %b @@ -77,7 +77,7 @@ entry: define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +92,7 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,27 +113,27 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s5, s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_ashr_i32 s4, s6, 31 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s4, 31 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_ashr_i32 s4, s6, 31 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %sext = sext i32 %a to i64 @@ -144,7 +144,7 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -162,7 +162,7 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -186,12 +186,12 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; SI-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x100000 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -199,12 +199,12 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun ; ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; VI-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x100000 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -217,7 +217,7 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -231,7 +231,7 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -255,34 +255,34 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s4, s5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s6, s7 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, s5 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s6, s7 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_cmp_eq_u32 s0, s1 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d @@ -295,32 +295,32 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s5, s6 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_cmp_eq_u32 s1, s2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s5, s6 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_cmp_eq_u32 s1, s2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp0 = icmp eq i32 %a, %tid @@ -342,50 +342,50 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s5, s4, 24 -; SI-NEXT: s_bfe_i32 s6, s4, 0x80010 -; SI-NEXT: s_bfe_i32 s7, s4, 0x80008 -; SI-NEXT: s_sext_i32_i8 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_ashr_i32 s4, s6, 24 +; SI-NEXT: s_bfe_i32 s5, s6, 0x80010 +; SI-NEXT: s_bfe_i32 s7, s6, 0x80008 +; SI-NEXT: s_sext_i32_i8 s6, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s4, 24 -; VI-NEXT: s_bfe_i32 s6, s4, 0x80010 -; VI-NEXT: s_bfe_i32 s7, s4, 0x80008 -; VI-NEXT: s_sext_i32_i8 s4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_ashr_i32 s4, s6, 24 +; VI-NEXT: s_bfe_i32 s5, s6, 0x80010 +; VI-NEXT: s_bfe_i32 s7, s6, 0x80008 +; VI-NEXT: s_sext_i32_i8 s6, s6 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -407,7 +407,7 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -435,7 +435,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -478,7 +478,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -504,7 +504,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) ; ; VI-LABEL: s_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -543,7 +543,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -571,7 +571,7 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index cca7b49996ff3b..894ab4fde7da08 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -41,30 +41,31 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX9-NEXT: s_mov_b32 s13, s15 +; GFX9-NEXT: s_mov_b32 s12, s14 +; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x4 +; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s8, s8, 16 -; GFX9-NEXT: s_mul_i32 s8, s8, s9 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, indirect@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, indirect@rel32@hi+12 -; GFX9-NEXT: v_mad_u32_u24 v3, v1, s9, v3 +; GFX9-NEXT: s_lshr_b32 s14, s14, 16 +; GFX9-NEXT: s_mul_i32 s14, s14, s15 +; GFX9-NEXT: v_mul_lo_u32 v3, s14, v0 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, indirect@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, indirect@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s14, s16 +; GFX9-NEXT: v_mad_u32_u24 v3, v1, s15, v3 ; GFX9-NEXT: v_add_lshl_u32 v5, v3, v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s18 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s19 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-NEXT: ds_write_b64 v5, v[3:4] -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: s_endpgm %fptr = alloca ptr, addrspace(5) %fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index b8721129222043..6b40df0345ebe3 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -7,8 +7,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i32_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -18,8 +18,8 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: sint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -36,8 +36,8 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i1_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -50,8 +50,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: sint_to_fp_i1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) { ; CI-LABEL: sint_to_fp_i1_f64_load: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -84,8 +84,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; ; VI-LABEL: sint_to_fp_i1_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -116,7 +116,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -134,7 +134,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CI-LABEL: v_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -153,7 +153,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; CI-LABEL: s_sint_to_fp_i8_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -193,8 +193,8 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_sint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -230,8 +230,8 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -244,8 +244,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -281,8 +281,8 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -295,8 +295,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -351,8 +351,8 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -365,8 +365,8 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index ab00d7d33bb9c7..a83ed902f1c9df 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -85,7 +85,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -114,7 +114,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -144,7 +144,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 @@ -184,7 +184,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -207,7 +207,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -228,7 +228,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -257,7 +257,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -285,7 +285,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -314,7 +314,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 @@ -353,105 +353,105 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_flbit_i32 s8, s7 -; GFX6-NEXT: s_xor_b32 s9, s6, s7 -; GFX6-NEXT: s_flbit_i32 s10, s5 -; GFX6-NEXT: s_xor_b32 s11, s4, s5 -; GFX6-NEXT: s_add_i32 s8, s8, -1 -; GFX6-NEXT: s_ashr_i32 s9, s9, 31 -; GFX6-NEXT: s_add_i32 s10, s10, -1 -; GFX6-NEXT: s_ashr_i32 s11, s11, 31 -; GFX6-NEXT: s_add_i32 s9, s9, 32 -; GFX6-NEXT: s_add_i32 s11, s11, 32 -; GFX6-NEXT: s_min_u32 s8, s8, s9 -; GFX6-NEXT: s_min_u32 s9, s10, s11 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GFX6-NEXT: s_sub_i32 s8, 32, s8 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX6-NEXT: s_sub_i32 s9, 32, s9 -; GFX6-NEXT: s_min_u32 s6, s6, 1 +; GFX6-NEXT: s_flbit_i32 s4, s11 +; GFX6-NEXT: s_xor_b32 s5, s10, s11 +; GFX6-NEXT: s_flbit_i32 s6, s9 +; GFX6-NEXT: s_xor_b32 s7, s8, s9 +; GFX6-NEXT: s_add_i32 s4, s4, -1 +; GFX6-NEXT: s_ashr_i32 s5, s5, 31 +; GFX6-NEXT: s_add_i32 s6, s6, -1 +; GFX6-NEXT: s_ashr_i32 s7, s7, 31 +; GFX6-NEXT: s_add_i32 s5, s5, 32 +; GFX6-NEXT: s_add_i32 s7, s7, 32 +; GFX6-NEXT: s_min_u32 s12, s4, s5 +; GFX6-NEXT: s_min_u32 s13, s6, s7 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s12 +; GFX6-NEXT: s_sub_i32 s10, 32, s12 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s13 +; GFX6-NEXT: s_sub_i32 s8, 32, s13 ; GFX6-NEXT: s_min_u32 s4, s4, 1 -; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: s_min_u32 s6, s6, 1 ; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s8 -; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s9 +; GFX6-NEXT: s_or_b32 s5, s7, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s10 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s8 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s3, s6, s7 -; GFX8-NEXT: s_flbit_i32 s2, s7 -; GFX8-NEXT: s_ashr_i32 s3, s3, 31 -; GFX8-NEXT: s_add_i32 s2, s2, -1 -; GFX8-NEXT: s_add_i32 s3, s3, 32 -; GFX8-NEXT: s_min_u32 s9, s2, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9 +; GFX8-NEXT: s_xor_b32 s7, s2, s3 +; GFX8-NEXT: s_flbit_i32 s6, s3 +; GFX8-NEXT: s_ashr_i32 s7, s7, 31 +; GFX8-NEXT: s_add_i32 s6, s6, -1 +; GFX8-NEXT: s_add_i32 s7, s7, 32 +; GFX8-NEXT: s_min_u32 s6, s6, s7 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 ; GFX8-NEXT: s_min_u32 s2, s2, 1 ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX8-NEXT: s_xor_b32 s2, s4, s5 -; GFX8-NEXT: s_flbit_i32 s8, s5 +; GFX8-NEXT: s_xor_b32 s2, s0, s1 +; GFX8-NEXT: s_flbit_i32 s8, s1 ; GFX8-NEXT: s_ashr_i32 s2, s2, 31 ; GFX8-NEXT: s_add_i32 s8, s8, -1 ; GFX8-NEXT: s_add_i32 s2, s2, 32 -; GFX8-NEXT: s_min_u32 s6, s8, s2 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6 -; GFX8-NEXT: s_min_u32 s2, s2, 1 -; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX8-NEXT: s_sub_i32 s2, 32, s9 -; GFX8-NEXT: v_ldexp_f32 v1, v0, s2 -; GFX8-NEXT: s_sub_i32 s2, 32, s6 -; GFX8-NEXT: v_ldexp_f32 v0, v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_min_u32 s2, s8, s2 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: s_min_u32 s0, s0, 1 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s6 +; GFX8-NEXT: v_ldexp_f32 v1, v0, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s2 +; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s3, s6, s7 -; GFX11-NEXT: s_xor_b32 s9, s4, s5 -; GFX11-NEXT: s_cls_i32 s2, s7 -; GFX11-NEXT: s_cls_i32 s8, s5 -; GFX11-NEXT: s_ashr_i32 s3, s3, 31 +; GFX11-NEXT: s_xor_b32 s7, s2, s3 +; GFX11-NEXT: s_xor_b32 s9, s0, s1 +; GFX11-NEXT: s_cls_i32 s6, s3 +; GFX11-NEXT: s_cls_i32 s8, s1 +; GFX11-NEXT: s_ashr_i32 s7, s7, 31 ; GFX11-NEXT: s_ashr_i32 s9, s9, 31 -; GFX11-NEXT: s_add_i32 s2, s2, -1 +; GFX11-NEXT: s_add_i32 s6, s6, -1 ; GFX11-NEXT: s_add_i32 s8, s8, -1 -; GFX11-NEXT: s_add_i32 s3, s3, 32 +; GFX11-NEXT: s_add_i32 s7, s7, 32 ; GFX11-NEXT: s_add_i32 s9, s9, 32 -; GFX11-NEXT: s_min_u32 s10, s2, s3 -; GFX11-NEXT: s_min_u32 s8, s8, s9 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 +; GFX11-NEXT: s_min_u32 s6, s6, s7 +; GFX11-NEXT: s_min_u32 s7, s8, s9 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX11-NEXT: s_min_u32 s2, s2, 1 -; GFX11-NEXT: s_min_u32 s4, s4, 1 +; GFX11-NEXT: s_min_u32 s0, s0, 1 ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX11-NEXT: s_sub_i32 s2, 32, s10 -; GFX11-NEXT: s_sub_i32 s3, 32, s8 +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX11-NEXT: s_sub_i32 s0, 32, s6 +; GFX11-NEXT: s_sub_i32 s1, 32, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v1, v0, s2 -; GFX11-NEXT: v_ldexp_f32 v0, v2, s3 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_ldexp_f32 v1, v0, s0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, s1 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x float> store <2 x float> %result, ptr addrspace(1) %out @@ -461,7 +461,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -528,7 +528,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -597,7 +597,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 @@ -674,35 +674,35 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_flbit_i32 s8, s7 -; GFX6-NEXT: s_xor_b32 s9, s6, s7 -; GFX6-NEXT: s_flbit_i32 s10, s5 -; GFX6-NEXT: s_xor_b32 s11, s4, s5 -; GFX6-NEXT: s_add_i32 s8, s8, -1 -; GFX6-NEXT: s_ashr_i32 s9, s9, 31 -; GFX6-NEXT: s_add_i32 s10, s10, -1 -; GFX6-NEXT: s_ashr_i32 s11, s11, 31 -; GFX6-NEXT: s_add_i32 s9, s9, 32 -; GFX6-NEXT: s_add_i32 s11, s11, 32 -; GFX6-NEXT: s_min_u32 s8, s8, s9 -; GFX6-NEXT: s_min_u32 s9, s10, s11 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GFX6-NEXT: s_sub_i32 s8, 32, s8 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX6-NEXT: s_sub_i32 s9, 32, s9 -; GFX6-NEXT: s_min_u32 s6, s6, 1 +; GFX6-NEXT: s_flbit_i32 s4, s11 +; GFX6-NEXT: s_xor_b32 s5, s10, s11 +; GFX6-NEXT: s_flbit_i32 s6, s9 +; GFX6-NEXT: s_xor_b32 s7, s8, s9 +; GFX6-NEXT: s_add_i32 s4, s4, -1 +; GFX6-NEXT: s_ashr_i32 s5, s5, 31 +; GFX6-NEXT: s_add_i32 s6, s6, -1 +; GFX6-NEXT: s_ashr_i32 s7, s7, 31 +; GFX6-NEXT: s_add_i32 s5, s5, 32 +; GFX6-NEXT: s_add_i32 s7, s7, 32 +; GFX6-NEXT: s_min_u32 s12, s4, s5 +; GFX6-NEXT: s_min_u32 s13, s6, s7 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s12 +; GFX6-NEXT: s_sub_i32 s10, 32, s12 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s13 +; GFX6-NEXT: s_sub_i32 s8, 32, s13 ; GFX6-NEXT: s_min_u32 s4, s4, 1 -; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: s_min_u32 s6, s6, 1 ; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8 -; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9 +; GFX6-NEXT: s_or_b32 s5, s7, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s10 +; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -712,79 +712,79 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s3, s6, s7 -; GFX8-NEXT: s_flbit_i32 s2, s7 -; GFX8-NEXT: s_ashr_i32 s3, s3, 31 -; GFX8-NEXT: s_add_i32 s2, s2, -1 -; GFX8-NEXT: s_add_i32 s3, s3, 32 -; GFX8-NEXT: s_min_u32 s8, s2, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 +; GFX8-NEXT: s_xor_b32 s7, s2, s3 +; GFX8-NEXT: s_flbit_i32 s6, s3 +; GFX8-NEXT: s_ashr_i32 s7, s7, 31 +; GFX8-NEXT: s_add_i32 s6, s6, -1 +; GFX8-NEXT: s_add_i32 s7, s7, 32 +; GFX8-NEXT: s_min_u32 s6, s6, s7 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 ; GFX8-NEXT: s_min_u32 s2, s2, 1 ; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: s_xor_b32 s3, s4, s5 +; GFX8-NEXT: s_xor_b32 s3, s0, s1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX8-NEXT: s_flbit_i32 s2, s5 +; GFX8-NEXT: s_flbit_i32 s2, s1 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31 ; GFX8-NEXT: s_add_i32 s2, s2, -1 ; GFX8-NEXT: s_add_i32 s3, s3, 32 -; GFX8-NEXT: s_min_u32 s7, s2, s3 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7 -; GFX8-NEXT: s_min_u32 s2, s2, 1 -; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GFX8-NEXT: s_sub_i32 s6, 32, s8 -; GFX8-NEXT: s_sub_i32 s2, 32, s7 +; GFX8-NEXT: s_min_u32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: s_min_u32 s0, s0, 1 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX8-NEXT: s_sub_i32 s6, 32, s6 +; GFX8-NEXT: s_sub_i32 s0, 32, s2 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s6 -; GFX8-NEXT: v_ldexp_f32 v1, v1, s2 +; GFX8-NEXT: v_ldexp_f32 v1, v1, s0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s3, s6, s7 -; GFX11-NEXT: s_xor_b32 s9, s4, s5 -; GFX11-NEXT: s_cls_i32 s2, s7 -; GFX11-NEXT: s_cls_i32 s8, s5 -; GFX11-NEXT: s_ashr_i32 s3, s3, 31 +; GFX11-NEXT: s_xor_b32 s7, s2, s3 +; GFX11-NEXT: s_xor_b32 s9, s0, s1 +; GFX11-NEXT: s_cls_i32 s6, s3 +; GFX11-NEXT: s_cls_i32 s8, s1 +; GFX11-NEXT: s_ashr_i32 s7, s7, 31 ; GFX11-NEXT: s_ashr_i32 s9, s9, 31 -; GFX11-NEXT: s_add_i32 s2, s2, -1 +; GFX11-NEXT: s_add_i32 s6, s6, -1 ; GFX11-NEXT: s_add_i32 s8, s8, -1 -; GFX11-NEXT: s_add_i32 s3, s3, 32 +; GFX11-NEXT: s_add_i32 s7, s7, 32 ; GFX11-NEXT: s_add_i32 s9, s9, 32 -; GFX11-NEXT: s_min_u32 s10, s2, s3 -; GFX11-NEXT: s_min_u32 s8, s8, s9 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 +; GFX11-NEXT: s_min_u32 s6, s6, s7 +; GFX11-NEXT: s_min_u32 s7, s8, s9 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX11-NEXT: s_min_u32 s2, s2, 1 -; GFX11-NEXT: s_min_u32 s4, s4, 1 +; GFX11-NEXT: s_min_u32 s0, s0, 1 ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX11-NEXT: s_sub_i32 s2, 32, s10 -; GFX11-NEXT: s_sub_i32 s3, 32, s8 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX11-NEXT: s_sub_i32 s0, 32, s6 +; GFX11-NEXT: s_sub_i32 s1, 32, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 -; GFX11-NEXT: v_ldexp_f32 v1, v1, s3 +; GFX11-NEXT: v_ldexp_f32 v0, v0, s0 +; GFX11-NEXT: v_ldexp_f32 v1, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x half> store <2 x half> %result, ptr addrspace(1) %out @@ -794,7 +794,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -869,7 +869,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -944,7 +944,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 8d34c7bb14ed49..cda4c085cd25a6 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; VI-LABEL: sitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -44,7 +44,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; GFX11-TRUE16-LABEL: sitofp_i16_to_f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -62,7 +62,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; GFX11-FAKE16-LABEL: sitofp_i16_to_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -89,7 +89,7 @@ entry: define amdgpu_kernel void @sitofp_i32_to_f16( ; SI-LABEL: sitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -108,7 +108,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; VI-LABEL: sitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -127,7 +127,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; GFX11-TRUE16-LABEL: sitofp_i32_to_f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -147,7 +147,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; GFX11-FAKE16-LABEL: sitofp_i32_to_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -178,7 +178,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; SI-LABEL: sitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -203,7 +203,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; VI-LABEL: sitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -223,7 +223,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX11-TRUE16-LABEL: sitofp_v2i16_to_v2f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -279,7 +279,7 @@ entry: define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; SI-LABEL: sitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -302,7 +302,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; VI-LABEL: sitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -324,7 +324,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; GFX11-TRUE16-LABEL: sitofp_v2i32_to_v2f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -351,7 +351,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -384,21 +384,19 @@ entry: define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -406,26 +404,26 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s10 +; VI-NEXT: s_mov_b32 s13, s11 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -433,27 +431,29 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s8 +; VI-NEXT: s_mov_b32 s1, s9 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: s_sint_to_fp_i1_to_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 -; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 -; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -463,27 +463,27 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 -; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4 -; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -493,7 +493,7 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index fbb9ba0b73846e..ae166212fe79de 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -5,7 +5,7 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GFX940-LABEL: test: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 3446e0384cc545..71033cfd1a6f34 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -7,34 +7,34 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 { ; GFX9-LABEL: s_abs_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, 0, s4 -; GFX9-NEXT: v_pk_max_i16 v1, s4, v1 +; GFX9-NEXT: v_pk_sub_i16 v1, 0, s2 +; GFX9-NEXT: v_pk_max_i16 v1, s2, v1 ; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_abs_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_sub_i32 s3, 0, s4 -; VI-NEXT: s_ashr_i32 s5, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_sub_i32 s4, 0, s2 +; VI-NEXT: s_ashr_i32 s5, s2, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sub_i32 s3, 0, s3 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sub_i32 s2, 0, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_max_i32 s3, s4, s3 -; VI-NEXT: s_max_i32 s2, s5, s2 -; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_max_i32 s2, s2, s4 +; VI-NEXT: s_max_i32 s3, s5, s3 +; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -44,24 +44,24 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; ; CI-LABEL: s_abs_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s6, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_ashr_i32 s5, s4, 16 -; CI-NEXT: s_lshr_b32 s6, s4, 16 -; CI-NEXT: s_sext_i32_i16 s7, s4 -; CI-NEXT: s_sub_i32 s4, 0, s4 -; CI-NEXT: s_sext_i32_i16 s4, s4 +; CI-NEXT: s_lshr_b32 s5, s6, 16 +; CI-NEXT: s_ashr_i32 s4, s6, 16 +; CI-NEXT: s_sext_i32_i16 s7, s6 ; CI-NEXT: s_sub_i32 s6, 0, s6 +; CI-NEXT: s_sub_i32 s5, 0, s5 ; CI-NEXT: s_sext_i32_i16 s6, s6 -; CI-NEXT: s_max_i32 s4, s7, s4 -; CI-NEXT: s_max_i32 s5, s5, s6 -; CI-NEXT: s_add_i32 s4, s4, 2 -; CI-NEXT: s_lshl_b32 s5, s5, 16 -; CI-NEXT: s_and_b32 s4, s4, 0xffff -; CI-NEXT: s_or_b32 s4, s5, s4 +; CI-NEXT: s_sext_i32_i16 s5, s5 +; CI-NEXT: s_max_i32 s4, s4, s5 +; CI-NEXT: s_max_i32 s5, s7, s6 +; CI-NEXT: s_add_i32 s5, s5, 2 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: s_and_b32 s5, s5, 0xffff +; CI-NEXT: s_or_b32 s4, s4, s5 ; CI-NEXT: s_add_i32 s4, s4, 0x20000 ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -77,20 +77,20 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 { ; GFX9-LABEL: v_abs_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, 0, v1 ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 ; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_abs_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 2 @@ -115,7 +115,7 @@ define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_abs_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -156,34 +156,34 @@ define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) #0 { ; GFX9-LABEL: s_abs_v2i16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, 0, s4 -; GFX9-NEXT: v_pk_max_i16 v1, s4, v1 +; GFX9-NEXT: v_pk_sub_i16 v1, 0, s2 +; GFX9-NEXT: v_pk_max_i16 v1, s2, v1 ; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_abs_v2i16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_sub_i32 s3, 0, s4 -; VI-NEXT: s_ashr_i32 s5, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_sub_i32 s4, 0, s2 +; VI-NEXT: s_ashr_i32 s5, s2, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sub_i32 s3, 0, s3 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sub_i32 s2, 0, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_max_i32 s3, s4, s3 -; VI-NEXT: s_max_i32 s2, s5, s2 -; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_max_i32 s2, s2, s4 +; VI-NEXT: s_max_i32 s3, s5, s3 +; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -193,24 +193,24 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) ; ; CI-LABEL: s_abs_v2i16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s6, s[4:5], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_ashr_i32 s5, s4, 16 -; CI-NEXT: s_lshr_b32 s6, s4, 16 -; CI-NEXT: s_sext_i32_i16 s7, s4 -; CI-NEXT: s_sub_i32 s4, 0, s4 -; CI-NEXT: s_sext_i32_i16 s4, s4 +; CI-NEXT: s_lshr_b32 s5, s6, 16 +; CI-NEXT: s_ashr_i32 s4, s6, 16 +; CI-NEXT: s_sext_i32_i16 s7, s6 ; CI-NEXT: s_sub_i32 s6, 0, s6 +; CI-NEXT: s_sub_i32 s5, 0, s5 ; CI-NEXT: s_sext_i32_i16 s6, s6 -; CI-NEXT: s_max_i32 s4, s7, s4 -; CI-NEXT: s_max_i32 s5, s5, s6 -; CI-NEXT: s_add_i32 s4, s4, 2 -; CI-NEXT: s_lshl_b32 s5, s5, 16 -; CI-NEXT: s_and_b32 s4, s4, 0xffff -; CI-NEXT: s_or_b32 s4, s5, s4 +; CI-NEXT: s_sext_i32_i16 s5, s5 +; CI-NEXT: s_max_i32 s4, s4, s5 +; CI-NEXT: s_max_i32 s5, s7, s6 +; CI-NEXT: s_add_i32 s5, s5, 2 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: s_and_b32 s5, s5, 0xffff +; CI-NEXT: s_or_b32 s4, s4, s5 ; CI-NEXT: s_add_i32 s4, s4, 0x20000 ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -230,21 +230,21 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 { ; GFX9-LABEL: v_abs_v2i16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_abs_v2i16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: v_mov_b32_e32 v4, 2 @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_abs_v2i16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_mov_b32 s11, s7 @@ -315,21 +315,21 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 { ; GFX9-LABEL: s_abs_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, 0, s7 -; GFX9-NEXT: v_pk_sub_i16 v1, 0, s6 -; GFX9-NEXT: v_pk_max_i16 v3, s6, v1 -; GFX9-NEXT: v_pk_max_i16 v0, s7, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, 0, s3 +; GFX9-NEXT: v_pk_sub_i16 v1, 0, s2 +; GFX9-NEXT: v_pk_max_i16 v3, s2, v1 +; GFX9-NEXT: v_pk_max_i16 v0, s3, v0 ; GFX9-NEXT: v_pk_add_u16 v1, v0, 2 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v3, 2 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_abs_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -368,7 +368,7 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; ; CI-LABEL: s_abs_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -427,11 +427,11 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 { ; GFX9-LABEL: v_abs_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v3, 0, v1 ; GFX9-NEXT: v_pk_sub_i16 v4, 0, v0 @@ -439,12 +439,12 @@ define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_abs_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 2 @@ -475,7 +475,7 @@ define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_abs_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_mov_b32 s11, s3 @@ -540,43 +540,43 @@ define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %val0, <2 x i16> %val1) #0 { ; GFX9-LABEL: s_min_max_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_pk_max_i16 v2, s0, v1 -; GFX9-NEXT: v_pk_min_i16 v1, s0, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_pk_max_i16 v2, s6, v1 +; GFX9-NEXT: v_pk_min_i16 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_min_max_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_ashr_i32 s2, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s3, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_max_i32 s4, s2, s3 -; VI-NEXT: s_max_i32 s5, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_ashr_i32 s0, s4, 16 +; VI-NEXT: s_sext_i32_i16 s1, s4 +; VI-NEXT: s_ashr_i32 s2, s5, 16 +; VI-NEXT: s_sext_i32_i16 s3, s5 +; VI-NEXT: s_max_i32 s4, s0, s2 +; VI-NEXT: s_max_i32 s5, s1, s3 ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_min_i32 s0, s0, s1 +; VI-NEXT: s_min_i32 s0, s0, s2 +; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_lshl_b32 s1, s2, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -586,35 +586,35 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; ; CI-LABEL: s_min_max_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_ashr_i32 s4, s12, 16 -; CI-NEXT: s_ashr_i32 s6, s13, 16 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: s_sext_i32_i16 s5, s12 -; CI-NEXT: s_sext_i32_i16 s7, s13 -; CI-NEXT: s_max_i32 s12, s4, s6 -; CI-NEXT: s_max_i32 s13, s5, s7 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_ashr_i32 s10, s8, 16 +; CI-NEXT: s_ashr_i32 s11, s9, 16 +; CI-NEXT: s_sext_i32_i16 s8, s8 +; CI-NEXT: s_sext_i32_i16 s9, s9 +; CI-NEXT: s_max_i32 s12, s10, s11 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_max_i32 s13, s8, s9 ; CI-NEXT: v_mov_b32_e32 v0, s12 -; CI-NEXT: s_min_i32 s4, s4, s6 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:2 +; CI-NEXT: s_min_i32 s10, s10, s11 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s13 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_min_i32 s5, s5, s7 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: s_min_i32 s8, s8, s9 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s10 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s5 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm @@ -630,24 +630,24 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 { ; GFX9-LABEL: v_min_max_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v3, v1, v2 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v3, s[4:5] +; GFX9-NEXT: global_store_dword v0, v3, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_min_max_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -675,7 +675,7 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; ; CI-LABEL: v_min_max_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s14, s10 @@ -727,24 +727,24 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { ; GFX9-LABEL: s_min_max_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_pk_max_i16 v1, s9, v2 -; GFX9-NEXT: v_pk_max_i16 v0, s8, v5 -; GFX9-NEXT: v_pk_min_i16 v3, s9, v2 -; GFX9-NEXT: v_pk_min_i16 v2, s8, v5 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_pk_max_i16 v1, s13, v2 +; GFX9-NEXT: v_pk_max_i16 v0, s12, v5 +; GFX9-NEXT: v_pk_min_i16 v3, s13, v2 +; GFX9-NEXT: v_pk_min_i16 v2, s12, v5 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_min_max_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -790,7 +790,7 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace ; ; CI-LABEL: s_min_max_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -853,12 +853,12 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 { ; GFX9-LABEL: v_min_max_v2i16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] glc +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[10:11] glc +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_gt_i32_sdwa vcc, sext(v1), sext(v2) src0_sel:WORD_0 src1_sel:WORD_0 @@ -876,9 +876,9 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX9-NEXT: global_store_dword v0, v4, s[4:5] +; GFX9-NEXT: global_store_dword v0, v4, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v0, off @@ -887,7 +887,7 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; ; VI-LABEL: v_min_max_v2i16_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -940,7 +940,7 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; ; CI-LABEL: v_min_max_v2i16_user: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_mov_b32 s2, s6 @@ -1002,41 +1002,41 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind { ; GFX9-LABEL: u_min_max_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_pk_max_u16 v2, s0, v1 -; GFX9-NEXT: v_pk_min_u16 v1, s0, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_pk_max_u16 v2, s6, v1 +; GFX9-NEXT: v_pk_min_u16 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: u_min_max_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; VI-NEXT: s_lshr_b32 s3, s1, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_max_u32 s5, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_max_u32 s4, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s2, s5, 16 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: s_and_b32 s3, s5, 0xffff +; VI-NEXT: s_max_u32 s5, s0, s2 +; VI-NEXT: s_max_u32 s4, s1, s3 ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_min_u32 s0, s0, s1 -; VI-NEXT: s_min_u32 s1, s2, s3 +; VI-NEXT: s_min_u32 s0, s0, s2 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_min_u32 s1, s1, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1046,35 +1046,35 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; ; CI-LABEL: u_min_max_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_lshr_b32 s4, s12, 16 -; CI-NEXT: s_lshr_b32 s6, s13, 16 -; CI-NEXT: s_mov_b32 s9, s5 -; CI-NEXT: s_mov_b32 s1, s7 -; CI-NEXT: s_and_b32 s5, s12, 0xffff -; CI-NEXT: s_and_b32 s7, s13, 0xffff -; CI-NEXT: s_max_u32 s13, s4, s6 -; CI-NEXT: s_max_u32 s12, s5, s7 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_lshr_b32 s10, s8, 16 +; CI-NEXT: s_lshr_b32 s11, s9, 16 +; CI-NEXT: s_and_b32 s8, s8, 0xffff +; CI-NEXT: s_and_b32 s9, s9, 0xffff +; CI-NEXT: s_max_u32 s13, s10, s11 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_max_u32 s12, s8, s9 ; CI-NEXT: v_mov_b32_e32 v0, s13 -; CI-NEXT: s_min_u32 s4, s4, s6 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:2 +; CI-NEXT: s_min_u32 s8, s8, s9 +; CI-NEXT: s_min_u32 s9, s10, s11 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s12 -; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: s_min_u32 s5, s5, s7 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s0, s2 +; CI-NEXT: s_mov_b32 s1, s3 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s9 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s5 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index c9c00a84e0f4bf..52db7fea08e053 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -92,8 +92,8 @@ entry: ; GCN-LABEL: {{^}}smrd6: ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 -; GFX9_10: s_add_u32 s0, s6, -4 -; GFX9_10: s_addc_u32 s1, s7, -1 +; GFX9_10: s_add_u32 s2, s2, -4 +; GFX9_10: s_addc_u32 s3, s3, -1 ; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll b/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll index fc700c55d7ee52..e59e3f6de697f0 100644 --- a/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-no-literal.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_sopk_size(i32 %var.mode) { ; GFX10-LABEL: test_sopk_size: ; GFX10: ; %bb.0: -; GFX10: s_load_b32 s0, s[2:3], 0x0 +; GFX10: s_load_b32 s0, s[4:5], 0x0 ; GFX10: s_mov_b32 s1, 3 ; GFX10: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s1 ; GFX10: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index b2235544686f1c..d6d81ac67cd8e1 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -80,7 +80,7 @@ endif: ; preds = %else, %if ; Force save and restore of m0 during SMEM spill ; GCN-LABEL: {{^}}m0_unavailable_spill: -; GCN: s_load_dword [[REG0:s[0-9]+]], s[2:3], {{0x[0-9]+}} +; GCN: s_load_dword [[REG0:s[0-9]+]], s[4:5], {{0x[0-9]+}} ; GCN: ; def m0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index daf0a2d1baa88a..0452c3b89e9a90 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; MUBUF-LABEL: test_inst_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -24,8 +24,8 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() { ; MUBUF-LABEL: test_sgpr_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -77,8 +77,8 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -193,7 +193,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -215,8 +215,8 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_mov_b32 s8, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND @@ -275,7 +275,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; MUBUF-LABEL: test_sgpr_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -298,8 +298,8 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -341,7 +341,7 @@ entry: define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; MUBUF-LABEL: test_inst_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -365,8 +365,8 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 76ecbc0863650b..ae70abc7317c31 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX6-NEXT: s_mov_b32 s42, -1 ; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s9 +; GFX6-NEXT: s_add_u32 s40, s40, s11 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: s_mov_b32 s2, 0x3fd00 ; GFX6-NEXT: s_mov_b64 s[8:9], 0x100 @@ -4668,7 +4668,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX9-FLATSCR-LABEL: test: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -4680,8 +4680,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill @@ -7294,11 +7294,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX10-FLATSCR-LABEL: test: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 -; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s8, s8, s13 +; GFX10-FLATSCR-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -9752,10 +9752,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s42, -1 ; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s9 +; GFX6-NEXT: s_add_u32 s40, s40, s11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 @@ -10312,14 +10312,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:240 -; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -10496,11 +10496,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX10-FLATSCR-LABEL: test_limited_sgpr: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 -; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s8, s8, s13 +; GFX10-FLATSCR-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; GFX10-FLATSCR-NEXT: s_mov_b32 s33, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 882356d994fc68..03e8e28ef54db2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -5,17 +5,17 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-LABEL: name: test_spill_av_class ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr6_sgpr7 + ; GCN-NEXT: liveins: $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) + ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %12.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %13.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %12 + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %13 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll index bc13b8d0330177..241bab3bd3bc4a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -44,7 +44,7 @@ define void @device_writelane_intrinsic(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @kernel_writelane_intrinsic(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GCN-LABEL: kernel_writelane_intrinsic: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v1, 45 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index cd06a060a50cd8..9d550ec27a63bf 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -94,7 +94,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -147,7 +147,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -243,7 +243,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -282,7 +282,7 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -395,13 +395,13 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_ashr_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s5, s4, 31 -; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8 +; SI-NEXT: s_ashr_i32 s7, s6, 31 +; SI-NEXT: s_ashr_i64 s[4:5], s[6:7], 8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -409,13 +409,13 @@ define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_ashr_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s4, 31 -; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8 +; VI-NEXT: s_ashr_i32 s7, s6, 31 +; VI-NEXT: s_ashr_i64 s[4:5], s[6:7], 8 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -443,7 +443,7 @@ entry: define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_i64_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -461,7 +461,7 @@ define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_i64_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -513,7 +513,7 @@ entry: define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -533,7 +533,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -597,7 +597,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s10, s2 @@ -714,15 +714,15 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0x14 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s7, s6, 31 -; SI-NEXT: s_add_u32 s4, s6, s4 -; SI-NEXT: s_addc_u32 s5, s7, s5 +; SI-NEXT: s_ashr_i32 s5, s8, 31 +; SI-NEXT: s_add_u32 s4, s8, s6 +; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -730,15 +730,15 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x50 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s6, 31 -; VI-NEXT: s_add_u32 s4, s6, s4 -; VI-NEXT: s_addc_u32 s5, s7, s5 +; VI-NEXT: s_ashr_i32 s5, s8, 31 +; VI-NEXT: s_add_u32 s4, s8, s6 +; VI-NEXT: s_addc_u32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -768,7 +768,7 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -785,7 +785,7 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -833,15 +833,15 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0x14 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s6, s6, 31 -; SI-NEXT: s_add_u32 s4, s6, s4 -; SI-NEXT: s_addc_u32 s5, s6, s5 +; SI-NEXT: s_ashr_i32 s5, s8, 31 +; SI-NEXT: s_add_u32 s4, s5, s6 +; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -849,15 +849,15 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s8, s[4:5], 0x50 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s6, 31 -; VI-NEXT: s_add_u32 s4, s6, s4 -; VI-NEXT: s_addc_u32 s5, s6, s5 +; VI-NEXT: s_ashr_i32 s5, s8, 31 +; VI-NEXT: s_add_u32 s4, s5, s6 +; VI-NEXT: s_addc_u32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -887,7 +887,7 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -905,7 +905,7 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 1622f498dce65a..ce15bbcc9e189b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -7,25 +7,25 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i16_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[6:7] +; GCN-NEXT: global_load_ushort v1, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v1 -; GCN-NEXT: s_sext_i32_i16 s0, s0 -; GCN-NEXT: s_mulk_i32 s0, 0x4925 -; GCN-NEXT: s_lshr_b32 s1, s0, 31 -; GCN-NEXT: s_ashr_i32 s0, s0, 17 -; GCN-NEXT: s_add_i32 s0, s0, s1 -; GCN-NEXT: s_mul_i32 s0, s0, 7 -; GCN-NEXT: v_subrev_u32_e32 v1, s0, v1 -; GCN-NEXT: global_store_short v0, v1, s[4:5] +; GCN-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NEXT: s_mulk_i32 s2, 0x4925 +; GCN-NEXT: s_lshr_b32 s3, s2, 31 +; GCN-NEXT: s_ashr_i32 s2, s2, 17 +; GCN-NEXT: s_add_i32 s2, s2, s3 +; GCN-NEXT: s_mul_i32 s2, s2, 7 +; GCN-NEXT: v_subrev_u32_e32 v1, s2, v1 +; GCN-NEXT: global_store_short v0, v1, s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i16_7: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -49,7 +49,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i16_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -113,43 +113,43 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v1 -; GCN-NEXT: s_abs_i32 s0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_sub_i32 s3, 0, s0 -; GCN-NEXT: s_ashr_i32 s2, s1, 31 +; GCN-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-NEXT: s_abs_i32 s2, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_sub_i32 s5, 0, s2 +; GCN-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: s_abs_i32 s1, s1 +; GCN-NEXT: s_abs_i32 s3, s3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: s_mul_i32 s3, s3, s6 -; GCN-NEXT: s_mul_hi_u32 s3, s6, s3 -; GCN-NEXT: s_add_i32 s6, s6, s3 -; GCN-NEXT: s_mul_hi_u32 s3, s1, s6 -; GCN-NEXT: s_mul_i32 s3, s3, s0 -; GCN-NEXT: s_sub_i32 s1, s1, s3 -; GCN-NEXT: s_sub_i32 s3, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-NEXT: s_sub_i32 s3, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s0, s3, s1 -; GCN-NEXT: s_xor_b32 s0, s0, s2 -; GCN-NEXT: s_sub_i32 s0, s0, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: global_store_dword v2, v0, s[4:5] +; GCN-NEXT: s_mul_i32 s5, s5, s6 +; GCN-NEXT: s_mul_hi_u32 s5, s6, s5 +; GCN-NEXT: s_add_i32 s6, s6, s5 +; GCN-NEXT: s_mul_hi_u32 s5, s3, s6 +; GCN-NEXT: s_mul_i32 s5, s5, s2 +; GCN-NEXT: s_sub_i32 s3, s3, s5 +; GCN-NEXT: s_sub_i32 s5, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s3, s5, s3 +; GCN-NEXT: s_sub_i32 s5, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s2, s5, s3 +; GCN-NEXT: s_xor_b32 s2, s2, s4 +; GCN-NEXT: s_sub_i32 s2, s2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_store_dword v2, v0, s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -192,7 +192,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -277,22 +277,22 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dword v1, v0, s[6:7] +; GCN-NEXT: global_load_dword v1, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2 ; GCN-NEXT: v_add_u32_e32 v2, v1, v2 ; GCN-NEXT: v_and_b32_e32 v2, -4, v2 ; GCN-NEXT: v_sub_u32_e32 v1, v1, v2 -; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: global_store_dword v0, v1, s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -314,7 +314,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -363,25 +363,25 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s0, 0x92492493 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dword v1, v0, s[6:7] +; GCN-NEXT: global_load_dword v1, v0, s[2:3] +; GCN-NEXT: s_mov_b32 s2, 0x92492493 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_i32 v2, v1, s0 +; GCN-NEXT: v_mul_hi_i32 v2, v1, s2 ; GCN-NEXT: v_add_u32_e32 v2, v2, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 31, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, 7 ; GCN-NEXT: v_sub_u32_e32 v1, v1, v2 -; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: global_store_dword v0, v1, s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i32_7: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -406,7 +406,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -459,69 +459,69 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_abs_i32 s0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s0 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 -; GCN-NEXT: s_sub_i32 s6, 0, s0 -; GCN-NEXT: s_ashr_i32 s3, s1, 31 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: s_abs_i32 s2, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GCN-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-NEXT: s_sub_i32 s6, 0, s2 +; GCN-NEXT: s_ashr_i32 s5, s3, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: s_abs_i32 s1, s1 -; GCN-NEXT: v_readfirstlane_b32 s2, v3 +; GCN-NEXT: s_abs_i32 s3, s3 +; GCN-NEXT: v_readfirstlane_b32 s4, v3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s7, v0 ; GCN-NEXT: s_mul_i32 s6, s6, s7 ; GCN-NEXT: s_mul_hi_u32 s6, s7, s6 ; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: s_mul_hi_u32 s6, s1, s7 -; GCN-NEXT: s_mul_i32 s6, s6, s0 -; GCN-NEXT: s_sub_i32 s1, s1, s6 -; GCN-NEXT: s_sub_i32 s6, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s1, s6, s1 -; GCN-NEXT: s_sub_i32 s6, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s0, s6, s1 -; GCN-NEXT: s_abs_i32 s1, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GCN-NEXT: s_xor_b32 s0, s0, s3 -; GCN-NEXT: s_sub_i32 s7, 0, s1 -; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: s_mul_hi_u32 s6, s3, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s2 +; GCN-NEXT: s_sub_i32 s3, s3, s6 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s2, s6, s3 +; GCN-NEXT: s_abs_i32 s3, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GCN-NEXT: s_xor_b32 s2, s2, s5 +; GCN-NEXT: s_sub_i32 s7, 0, s3 +; GCN-NEXT: s_sub_i32 s2, s2, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s2, v1 -; GCN-NEXT: s_ashr_i32 s6, s2, 31 -; GCN-NEXT: s_abs_i32 s2, s2 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: s_ashr_i32 s6, s4, 31 +; GCN-NEXT: s_abs_i32 s4, s4 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_mul_i32 s7, s7, s3 -; GCN-NEXT: s_mul_hi_u32 s7, s3, s7 -; GCN-NEXT: s_add_i32 s3, s3, s7 -; GCN-NEXT: s_mul_hi_u32 s3, s2, s3 -; GCN-NEXT: s_mul_i32 s3, s3, s1 -; GCN-NEXT: s_sub_i32 s2, s2, s3 -; GCN-NEXT: s_sub_i32 s3, s2, s1 -; GCN-NEXT: s_cmp_ge_u32 s2, s1 -; GCN-NEXT: s_cselect_b32 s2, s3, s2 -; GCN-NEXT: s_sub_i32 s3, s2, s1 -; GCN-NEXT: s_cmp_ge_u32 s2, s1 -; GCN-NEXT: s_cselect_b32 s1, s3, s2 -; GCN-NEXT: s_xor_b32 s1, s1, s6 -; GCN-NEXT: s_sub_i32 s1, s1, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_mul_i32 s7, s7, s5 +; GCN-NEXT: s_mul_hi_u32 s7, s5, s7 +; GCN-NEXT: s_add_i32 s5, s5, s7 +; GCN-NEXT: s_mul_hi_u32 s5, s4, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s3 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_sub_i32 s5, s4, s3 +; GCN-NEXT: s_cmp_ge_u32 s4, s3 +; GCN-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-NEXT: s_sub_i32 s5, s4, s3 +; GCN-NEXT: s_cmp_ge_u32 s4, s3 +; GCN-NEXT: s_cselect_b32 s3, s5, s4 +; GCN-NEXT: s_xor_b32 s3, s3, s6 +; GCN-NEXT: s_sub_i32 s3, s3, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -590,7 +590,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -723,31 +723,31 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: s_ashr_i32 s2, s0, 31 -; GCN-NEXT: s_ashr_i32 s3, s1, 31 -; GCN-NEXT: s_lshr_b32 s2, s2, 30 -; GCN-NEXT: s_lshr_b32 s3, s3, 30 -; GCN-NEXT: s_add_i32 s2, s0, s2 -; GCN-NEXT: s_add_i32 s3, s1, s3 -; GCN-NEXT: s_and_b32 s2, s2, -4 -; GCN-NEXT: s_and_b32 s3, s3, -4 -; GCN-NEXT: s_sub_i32 s0, s0, s2 -; GCN-NEXT: s_sub_i32 s1, s1, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: s_ashr_i32 s4, s2, 31 +; GCN-NEXT: s_ashr_i32 s5, s3, 31 +; GCN-NEXT: s_lshr_b32 s4, s4, 30 +; GCN-NEXT: s_lshr_b32 s5, s5, 30 +; GCN-NEXT: s_add_i32 s4, s2, s4 +; GCN-NEXT: s_add_i32 s5, s3, s5 +; GCN-NEXT: s_and_b32 s4, s4, -4 +; GCN-NEXT: s_and_b32 s5, s5, -4 +; GCN-NEXT: s_sub_i32 s2, s2, s4 +; GCN-NEXT: s_sub_i32 s3, s3, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -778,7 +778,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -842,123 +842,123 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[1:4], v0, s[6:7] offset:16 -; GCN-NEXT: global_load_dwordx4 v[5:8], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_readfirstlane_b32 s0, v1 -; GCN-NEXT: s_abs_i32 s0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_sub_i32 s6, 0, s0 +; GCN-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-NEXT: s_abs_i32 s2, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GCN-NEXT: s_sub_i32 s6, 0, s2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s2, v5 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 +; GCN-NEXT: v_readfirstlane_b32 s4, v5 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: s_abs_i32 s2, s2 -; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: v_readfirstlane_b32 s3, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NEXT: s_mul_i32 s6, s6, s7 ; GCN-NEXT: s_mul_hi_u32 s6, s7, s6 ; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: s_mul_hi_u32 s6, s2, s7 -; GCN-NEXT: s_mul_i32 s6, s6, s0 -; GCN-NEXT: s_sub_i32 s2, s2, s6 -; GCN-NEXT: s_sub_i32 s6, s2, s0 -; GCN-NEXT: s_cmp_ge_u32 s2, s0 -; GCN-NEXT: s_cselect_b32 s2, s6, s2 -; GCN-NEXT: s_sub_i32 s6, s2, s0 -; GCN-NEXT: s_cmp_ge_u32 s2, s0 -; GCN-NEXT: s_cselect_b32 s0, s6, s2 -; GCN-NEXT: s_abs_i32 s1, s1 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GCN-NEXT: s_xor_b32 s0, s0, s3 -; GCN-NEXT: s_sub_i32 s8, 0, s1 -; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: s_mul_hi_u32 s6, s4, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s2 +; GCN-NEXT: s_sub_i32 s4, s4, s6 +; GCN-NEXT: s_sub_i32 s6, s4, s2 +; GCN-NEXT: s_cmp_ge_u32 s4, s2 +; GCN-NEXT: s_cselect_b32 s4, s6, s4 +; GCN-NEXT: s_sub_i32 s6, s4, s2 +; GCN-NEXT: s_cmp_ge_u32 s4, s2 +; GCN-NEXT: s_cselect_b32 s2, s6, s4 +; GCN-NEXT: s_abs_i32 s3, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GCN-NEXT: s_xor_b32 s2, s2, s5 +; GCN-NEXT: s_sub_i32 s8, 0, s3 +; GCN-NEXT: s_sub_i32 s2, s2, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v6 ; GCN-NEXT: s_ashr_i32 s7, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s2, v3 -; GCN-NEXT: v_readfirstlane_b32 s3, v1 -; GCN-NEXT: s_mul_i32 s8, s8, s3 -; GCN-NEXT: s_mul_hi_u32 s8, s3, s8 -; GCN-NEXT: s_add_i32 s3, s3, s8 -; GCN-NEXT: s_mul_hi_u32 s3, s6, s3 -; GCN-NEXT: s_mul_i32 s3, s3, s1 -; GCN-NEXT: s_sub_i32 s3, s6, s3 -; GCN-NEXT: s_sub_i32 s6, s3, s1 -; GCN-NEXT: s_cmp_ge_u32 s3, s1 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_sub_i32 s6, s3, s1 -; GCN-NEXT: s_cmp_ge_u32 s3, s1 -; GCN-NEXT: s_cselect_b32 s1, s6, s3 -; GCN-NEXT: s_abs_i32 s2, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_xor_b32 s1, s1, s7 -; GCN-NEXT: s_sub_i32 s9, 0, s2 -; GCN-NEXT: s_sub_i32 s1, s1, s7 +; GCN-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NEXT: s_mul_i32 s8, s8, s5 +; GCN-NEXT: s_mul_hi_u32 s8, s5, s8 +; GCN-NEXT: s_add_i32 s5, s5, s8 +; GCN-NEXT: s_mul_hi_u32 s5, s6, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s3 +; GCN-NEXT: s_sub_i32 s5, s6, s5 +; GCN-NEXT: s_sub_i32 s6, s5, s3 +; GCN-NEXT: s_cmp_ge_u32 s5, s3 +; GCN-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-NEXT: s_sub_i32 s6, s5, s3 +; GCN-NEXT: s_cmp_ge_u32 s5, s3 +; GCN-NEXT: s_cselect_b32 s3, s6, s5 +; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GCN-NEXT: s_xor_b32 s3, s3, s7 +; GCN-NEXT: s_sub_i32 s9, 0, s4 +; GCN-NEXT: s_sub_i32 s3, s3, s7 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v7 ; GCN-NEXT: s_ashr_i32 s8, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s3, v4 +; GCN-NEXT: v_readfirstlane_b32 s5, v4 ; GCN-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NEXT: s_mul_i32 s9, s9, s7 ; GCN-NEXT: s_mul_hi_u32 s9, s7, s9 ; GCN-NEXT: s_add_i32 s7, s7, s9 ; GCN-NEXT: s_mul_hi_u32 s7, s6, s7 -; GCN-NEXT: s_mul_i32 s7, s7, s2 +; GCN-NEXT: s_mul_i32 s7, s7, s4 ; GCN-NEXT: s_sub_i32 s6, s6, s7 -; GCN-NEXT: s_sub_i32 s7, s6, s2 -; GCN-NEXT: s_cmp_ge_u32 s6, s2 +; GCN-NEXT: s_sub_i32 s7, s6, s4 +; GCN-NEXT: s_cmp_ge_u32 s6, s4 ; GCN-NEXT: s_cselect_b32 s6, s7, s6 -; GCN-NEXT: s_sub_i32 s7, s6, s2 -; GCN-NEXT: s_cmp_ge_u32 s6, s2 -; GCN-NEXT: s_cselect_b32 s2, s7, s6 -; GCN-NEXT: s_abs_i32 s3, s3 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GCN-NEXT: s_sub_i32 s7, s6, s4 +; GCN-NEXT: s_cmp_ge_u32 s6, s4 +; GCN-NEXT: s_cselect_b32 s4, s7, s6 +; GCN-NEXT: s_abs_i32 s5, s5 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GCN-NEXT: v_readfirstlane_b32 s6, v8 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: s_ashr_i32 s0, s6, 31 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_ashr_i32 s2, s6, 31 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_abs_i32 s1, s6 -; GCN-NEXT: s_sub_i32 s6, 0, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_abs_i32 s3, s6 +; GCN-NEXT: s_sub_i32 s6, 0, s5 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: s_xor_b32 s2, s2, s8 -; GCN-NEXT: s_sub_i32 s2, s2, s8 +; GCN-NEXT: s_xor_b32 s4, s4, s8 +; GCN-NEXT: s_sub_i32 s4, s4, s8 ; GCN-NEXT: v_readfirstlane_b32 s7, v3 ; GCN-NEXT: s_mul_i32 s6, s6, s7 ; GCN-NEXT: s_mul_hi_u32 s6, s7, s6 ; GCN-NEXT: s_add_i32 s7, s7, s6 -; GCN-NEXT: s_mul_hi_u32 s6, s1, s7 -; GCN-NEXT: s_mul_i32 s6, s6, s3 -; GCN-NEXT: s_sub_i32 s1, s1, s6 -; GCN-NEXT: s_sub_i32 s6, s1, s3 -; GCN-NEXT: s_cmp_ge_u32 s1, s3 -; GCN-NEXT: s_cselect_b32 s1, s6, s1 -; GCN-NEXT: s_sub_i32 s6, s1, s3 -; GCN-NEXT: s_cmp_ge_u32 s1, s3 -; GCN-NEXT: s_cselect_b32 s1, s6, s1 -; GCN-NEXT: s_xor_b32 s1, s1, s0 -; GCN-NEXT: s_sub_i32 s0, s1, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] +; GCN-NEXT: s_mul_hi_u32 s6, s3, s7 +; GCN-NEXT: s_mul_i32 s6, s6, s5 +; GCN-NEXT: s_sub_i32 s3, s3, s6 +; GCN-NEXT: s_sub_i32 s6, s3, s5 +; GCN-NEXT: s_cmp_ge_u32 s3, s5 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s5 +; GCN-NEXT: s_cmp_ge_u32 s3, s5 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_xor_b32 s3, s3, s2 +; GCN-NEXT: s_sub_i32 s2, s3, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s4, s2, 16 ; TONGA-NEXT: s_addc_u32 s5, s3, 0 @@ -1317,45 +1317,45 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-NEXT: v_readfirstlane_b32 s3, v3 -; GCN-NEXT: s_ashr_i32 s6, s0, 31 -; GCN-NEXT: s_ashr_i32 s7, s1, 31 -; GCN-NEXT: s_ashr_i32 s8, s2, 31 -; GCN-NEXT: s_ashr_i32 s9, s3, 31 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s3, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NEXT: s_ashr_i32 s6, s2, 31 +; GCN-NEXT: s_ashr_i32 s7, s3, 31 +; GCN-NEXT: s_ashr_i32 s8, s4, 31 +; GCN-NEXT: s_ashr_i32 s9, s5, 31 ; GCN-NEXT: s_lshr_b32 s6, s6, 30 ; GCN-NEXT: s_lshr_b32 s7, s7, 30 ; GCN-NEXT: s_lshr_b32 s8, s8, 30 ; GCN-NEXT: s_lshr_b32 s9, s9, 30 -; GCN-NEXT: s_add_i32 s6, s0, s6 -; GCN-NEXT: s_add_i32 s7, s1, s7 -; GCN-NEXT: s_add_i32 s8, s2, s8 -; GCN-NEXT: s_add_i32 s9, s3, s9 +; GCN-NEXT: s_add_i32 s6, s2, s6 +; GCN-NEXT: s_add_i32 s7, s3, s7 +; GCN-NEXT: s_add_i32 s8, s4, s8 +; GCN-NEXT: s_add_i32 s9, s5, s9 ; GCN-NEXT: s_and_b32 s6, s6, -4 ; GCN-NEXT: s_and_b32 s7, s7, -4 ; GCN-NEXT: s_and_b32 s8, s8, -4 ; GCN-NEXT: s_and_b32 s9, s9, -4 -; GCN-NEXT: s_sub_i32 s0, s0, s6 -; GCN-NEXT: s_sub_i32 s1, s1, s7 -; GCN-NEXT: s_sub_i32 s2, s2, s8 -; GCN-NEXT: s_sub_i32 s3, s3, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-NEXT: s_sub_i32 s2, s2, s6 +; GCN-NEXT: s_sub_i32 s3, s3, s7 +; GCN-NEXT: s_sub_i32 s4, s4, s8 +; GCN-NEXT: s_sub_i32 s5, s5, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -1491,24 +1491,24 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[10:11] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NEXT: v_readfirstlane_b32 s7, v3 -; GCN-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[6:7] +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB8_4 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_ashr_i32 s0, s7, 31 -; GCN-NEXT: s_add_u32 s2, s6, s0 +; GCN-NEXT: s_ashr_i32 s0, s5, 31 +; GCN-NEXT: s_add_u32 s2, s4, s0 ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s7, s0 +; GCN-NEXT: s_addc_u32 s3, s5, s0 ; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -1524,46 +1524,46 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s2, v1 ; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_mul_i32 s7, s0, s2 +; GCN-NEXT: s_mul_i32 s5, s0, s2 ; GCN-NEXT: s_mul_hi_u32 s15, s0, s3 ; GCN-NEXT: s_mul_i32 s14, s1, s3 -; GCN-NEXT: s_add_i32 s7, s15, s7 -; GCN-NEXT: s_add_i32 s7, s7, s14 +; GCN-NEXT: s_add_i32 s5, s15, s5 +; GCN-NEXT: s_add_i32 s5, s5, s14 ; GCN-NEXT: s_mul_i32 s16, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s7 -; GCN-NEXT: s_mul_i32 s15, s3, s7 +; GCN-NEXT: s_mul_hi_u32 s14, s3, s5 +; GCN-NEXT: s_mul_i32 s15, s3, s5 ; GCN-NEXT: s_mul_hi_u32 s3, s3, s16 ; GCN-NEXT: s_add_u32 s3, s3, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 ; GCN-NEXT: s_mul_hi_u32 s17, s2, s16 ; GCN-NEXT: s_mul_i32 s16, s2, s16 ; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s15, s2, s7 +; GCN-NEXT: s_mul_hi_u32 s15, s2, s5 ; GCN-NEXT: s_addc_u32 s3, s14, s17 ; GCN-NEXT: s_addc_u32 s14, s15, 0 -; GCN-NEXT: s_mul_i32 s7, s2, s7 -; GCN-NEXT: s_add_u32 s3, s3, s7 -; GCN-NEXT: s_addc_u32 s7, 0, s14 +; GCN-NEXT: s_mul_i32 s5, s2, s5 +; GCN-NEXT: s_add_u32 s3, s3, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s14 ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_addc_u32 s2, s2, s5 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s14, s0, s7 +; GCN-NEXT: s_mul_hi_u32 s14, s0, s5 ; GCN-NEXT: s_add_i32 s3, s14, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s7 +; GCN-NEXT: s_mul_i32 s1, s1, s5 ; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s7 +; GCN-NEXT: s_mul_i32 s0, s0, s5 ; GCN-NEXT: s_mul_hi_u32 s14, s2, s0 ; GCN-NEXT: s_mul_i32 s15, s2, s0 -; GCN-NEXT: s_mul_i32 s17, s7, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s7, s0 -; GCN-NEXT: s_mul_hi_u32 s16, s7, s3 +; GCN-NEXT: s_mul_i32 s17, s5, s3 +; GCN-NEXT: s_mul_hi_u32 s0, s5, s0 +; GCN-NEXT: s_mul_hi_u32 s16, s5, s3 ; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s7, 0, s16 +; GCN-NEXT: s_addc_u32 s5, 0, s16 ; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s7, s14 +; GCN-NEXT: s_addc_u32 s0, s5, s14 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mul_i32 s3, s2, s3 ; GCN-NEXT: s_add_u32 s0, s0, s3 @@ -1571,23 +1571,23 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s14, s9, 31 -; GCN-NEXT: s_add_u32 s0, s8, s14 +; GCN-NEXT: s_ashr_i32 s14, s7, 31 +; GCN-NEXT: s_add_u32 s0, s6, s14 ; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s9, s14 +; GCN-NEXT: s_addc_u32 s1, s7, s14 ; GCN-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] ; GCN-NEXT: v_readfirstlane_b32 s3, v0 ; GCN-NEXT: s_mul_i32 s1, s16, s2 -; GCN-NEXT: s_mul_hi_u32 s7, s16, s3 +; GCN-NEXT: s_mul_hi_u32 s5, s16, s3 ; GCN-NEXT: s_mul_hi_u32 s0, s16, s2 -; GCN-NEXT: s_add_u32 s1, s7, s1 +; GCN-NEXT: s_add_u32 s1, s5, s1 ; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s9, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s7, s17, s3 ; GCN-NEXT: s_mul_i32 s3, s17, s3 ; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s7, s17, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: s_mul_hi_u32 s5, s17, s2 +; GCN-NEXT: s_addc_u32 s0, s0, s7 +; GCN-NEXT: s_addc_u32 s1, s5, 0 ; GCN-NEXT: s_mul_i32 s2, s17, s2 ; GCN-NEXT: s_add_u32 s0, s0, s2 ; GCN-NEXT: s_addc_u32 s1, 0, s1 @@ -1596,15 +1596,15 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_add_i32 s1, s2, s1 ; GCN-NEXT: s_mul_i32 s2, s13, s0 ; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_i32 s7, s1, s2 +; GCN-NEXT: s_add_i32 s5, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, s17, s7 +; GCN-NEXT: s_sub_i32 s1, s17, s5 ; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, s16, v0 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s9, s1, s13 +; GCN-NEXT: s_subb_u32 s7, s1, s13 ; GCN-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s15, s9, 0 +; GCN-NEXT: s_subb_u32 s15, s7, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s13 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1 @@ -1614,7 +1614,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s9, s13 +; GCN-NEXT: s_subb_u32 s2, s7, s13 ; GCN-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: s_subb_u32 s2, s2, 0 @@ -1624,7 +1624,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s17, s7 +; GCN-NEXT: s_subb_u32 s0, s17, s5 ; GCN-NEXT: s_cmp_ge_u32 s0, s13 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 @@ -1644,8 +1644,8 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GCN-NEXT: s_cbranch_execnz .LBB8_3 ; GCN-NEXT: .LBB8_2: -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GCN-NEXT: s_sub_i32 s0, 0, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_sub_i32 s0, 0, s4 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -1654,20 +1654,20 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_mul_i32 s0, s0, s2 ; GCN-NEXT: s_mul_hi_u32 s0, s2, s0 ; GCN-NEXT: s_add_i32 s2, s2, s0 -; GCN-NEXT: s_mul_hi_u32 s0, s8, s2 -; GCN-NEXT: s_mul_i32 s0, s0, s6 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s6 -; GCN-NEXT: s_cmp_ge_u32 s0, s6 +; GCN-NEXT: s_mul_hi_u32 s0, s6, s2 +; GCN-NEXT: s_mul_i32 s0, s0, s4 +; GCN-NEXT: s_sub_i32 s0, s6, s0 +; GCN-NEXT: s_sub_i32 s2, s0, s4 +; GCN-NEXT: s_cmp_ge_u32 s0, s4 ; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s6 -; GCN-NEXT: s_cmp_ge_u32 s0, s6 +; GCN-NEXT: s_sub_i32 s2, s0, s4 +; GCN-NEXT: s_cmp_ge_u32 s0, s4 ; GCN-NEXT: s_cselect_b32 s0, s2, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: .LBB8_3: ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB8_4: ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1675,7 +1675,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TAHITI-LABEL: srem_i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v4, 0 @@ -1836,7 +1836,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v4, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s6 @@ -2589,10 +2589,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3 @@ -2601,12 +2601,12 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_and_b32_e32 v3, -4, v3 ; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 ; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -2630,7 +2630,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -2684,11 +2684,11 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[10:11] offset:16 +; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[10:11] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_readfirstlane_b32 s11, v1 ; GCN-NEXT: v_readfirstlane_b32 s10, v0 @@ -2697,11 +2697,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s12, v4 ; GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[10:11] ; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: v_readfirstlane_b32 s7, v3 -; GCN-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: v_readfirstlane_b32 s7, v7 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NEXT: v_readfirstlane_b32 s6, v6 ; GCN-NEXT: s_cbranch_scc0 .LBB10_7 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_ashr_i32 s0, s11, 31 @@ -2865,15 +2865,15 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: .LBB10_3: -; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[6:7] +; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB10_8 ; GCN-NEXT: ; %bb.4: -; GCN-NEXT: s_ashr_i32 s0, s7, 31 -; GCN-NEXT: s_add_u32 s2, s6, s0 +; GCN-NEXT: s_ashr_i32 s0, s5, 31 +; GCN-NEXT: s_add_u32 s2, s4, s0 ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s7, s0 +; GCN-NEXT: s_addc_u32 s3, s5, s0 ; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s13 @@ -2889,46 +2889,46 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_readfirstlane_b32 s2, v3 ; GCN-NEXT: v_readfirstlane_b32 s3, v2 -; GCN-NEXT: s_mul_i32 s7, s0, s2 +; GCN-NEXT: s_mul_i32 s5, s0, s2 ; GCN-NEXT: s_mul_hi_u32 s15, s0, s3 ; GCN-NEXT: s_mul_i32 s14, s1, s3 -; GCN-NEXT: s_add_i32 s7, s15, s7 -; GCN-NEXT: s_add_i32 s7, s7, s14 +; GCN-NEXT: s_add_i32 s5, s15, s5 +; GCN-NEXT: s_add_i32 s5, s5, s14 ; GCN-NEXT: s_mul_i32 s16, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s7 -; GCN-NEXT: s_mul_i32 s15, s3, s7 +; GCN-NEXT: s_mul_hi_u32 s14, s3, s5 +; GCN-NEXT: s_mul_i32 s15, s3, s5 ; GCN-NEXT: s_mul_hi_u32 s3, s3, s16 ; GCN-NEXT: s_add_u32 s3, s3, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 ; GCN-NEXT: s_mul_hi_u32 s17, s2, s16 ; GCN-NEXT: s_mul_i32 s16, s2, s16 ; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s15, s2, s7 +; GCN-NEXT: s_mul_hi_u32 s15, s2, s5 ; GCN-NEXT: s_addc_u32 s3, s14, s17 ; GCN-NEXT: s_addc_u32 s14, s15, 0 -; GCN-NEXT: s_mul_i32 s7, s2, s7 -; GCN-NEXT: s_add_u32 s3, s3, s7 -; GCN-NEXT: s_addc_u32 s7, 0, s14 +; GCN-NEXT: s_mul_i32 s5, s2, s5 +; GCN-NEXT: s_add_u32 s3, s3, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s14 ; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s3, v2 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-NEXT: s_addc_u32 s2, s2, s5 +; GCN-NEXT: v_readfirstlane_b32 s5, v2 ; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s14, s0, s7 +; GCN-NEXT: s_mul_hi_u32 s14, s0, s5 ; GCN-NEXT: s_add_i32 s3, s14, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s7 +; GCN-NEXT: s_mul_i32 s1, s1, s5 ; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s7 +; GCN-NEXT: s_mul_i32 s0, s0, s5 ; GCN-NEXT: s_mul_hi_u32 s14, s2, s0 ; GCN-NEXT: s_mul_i32 s15, s2, s0 -; GCN-NEXT: s_mul_i32 s17, s7, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s7, s0 -; GCN-NEXT: s_mul_hi_u32 s16, s7, s3 +; GCN-NEXT: s_mul_i32 s17, s5, s3 +; GCN-NEXT: s_mul_hi_u32 s0, s5, s0 +; GCN-NEXT: s_mul_hi_u32 s16, s5, s3 ; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s7, 0, s16 +; GCN-NEXT: s_addc_u32 s5, 0, s16 ; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s7, s14 +; GCN-NEXT: s_addc_u32 s0, s5, s14 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mul_i32 s3, s2, s3 ; GCN-NEXT: s_add_u32 s0, s0, s3 @@ -2936,23 +2936,23 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s14, s9, 31 -; GCN-NEXT: s_add_u32 s0, s8, s14 +; GCN-NEXT: s_ashr_i32 s14, s7, 31 +; GCN-NEXT: s_add_u32 s0, s6, s14 ; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s9, s14 +; GCN-NEXT: s_addc_u32 s1, s7, s14 ; GCN-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] ; GCN-NEXT: v_readfirstlane_b32 s3, v2 ; GCN-NEXT: s_mul_i32 s1, s16, s2 -; GCN-NEXT: s_mul_hi_u32 s7, s16, s3 +; GCN-NEXT: s_mul_hi_u32 s5, s16, s3 ; GCN-NEXT: s_mul_hi_u32 s0, s16, s2 -; GCN-NEXT: s_add_u32 s1, s7, s1 +; GCN-NEXT: s_add_u32 s1, s5, s1 ; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s9, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s7, s17, s3 ; GCN-NEXT: s_mul_i32 s3, s17, s3 ; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s7, s17, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: s_mul_hi_u32 s5, s17, s2 +; GCN-NEXT: s_addc_u32 s0, s0, s7 +; GCN-NEXT: s_addc_u32 s1, s5, 0 ; GCN-NEXT: s_mul_i32 s2, s17, s2 ; GCN-NEXT: s_add_u32 s0, s0, s2 ; GCN-NEXT: s_addc_u32 s1, 0, s1 @@ -2961,15 +2961,15 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_add_i32 s1, s2, s1 ; GCN-NEXT: s_mul_i32 s2, s13, s0 ; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_i32 s7, s1, s2 +; GCN-NEXT: s_add_i32 s5, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_sub_i32 s1, s17, s7 +; GCN-NEXT: s_sub_i32 s1, s17, s5 ; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v2 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s9, s1, s13 +; GCN-NEXT: s_subb_u32 s7, s1, s13 ; GCN-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v2 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s15, s9, 0 +; GCN-NEXT: s_subb_u32 s15, s7, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s13 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v3 @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s9, s13 +; GCN-NEXT: s_subb_u32 s2, s7, s13 ; GCN-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v3 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: s_subb_u32 s2, s2, 0 @@ -2989,7 +2989,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mov_b32_e32 v5, s2 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s17, s7 +; GCN-NEXT: s_subb_u32 s0, s17, s5 ; GCN-NEXT: s_cmp_ge_u32 s0, s13 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 @@ -3009,27 +3009,27 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc ; GCN-NEXT: s_cbranch_execnz .LBB10_6 ; GCN-NEXT: .LBB10_5: -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-NEXT: s_sub_i32 s0, 0, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GCN-NEXT: s_sub_i32 s0, 0, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s6 -; GCN-NEXT: v_sub_u32_e32 v2, s8, v2 -; GCN-NEXT: v_subrev_u32_e32 v3, s6, v2 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v2, s4 +; GCN-NEXT: v_sub_u32_e32 v2, s6, v2 +; GCN-NEXT: v_subrev_u32_e32 v3, s4, v2 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-NEXT: v_subrev_u32_e32 v3, s6, v2 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GCN-NEXT: v_subrev_u32_e32 v3, s4, v2 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: .LBB10_6: ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB10_7: ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3039,7 +3039,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v2i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -3346,7 +3346,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 16 @@ -4733,10 +4733,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 @@ -4752,12 +4752,12 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 ; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -4788,7 +4788,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -4860,28 +4860,28 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[10:13], v8, s[6:7] offset:32 -; GCN-NEXT: global_load_dwordx4 v[14:17], v8, s[6:7] -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 +; GCN-NEXT: global_load_dwordx4 v[10:13], v8, s[10:11] offset:32 +; GCN-NEXT: global_load_dwordx4 v[14:17], v8, s[10:11] +; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[10:11] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[10:11] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_readfirstlane_b32 s7, v11 -; GCN-NEXT: v_readfirstlane_b32 s6, v10 +; GCN-NEXT: v_readfirstlane_b32 s5, v11 +; GCN-NEXT: v_readfirstlane_b32 s4, v10 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_readfirstlane_b32 s9, v15 -; GCN-NEXT: v_readfirstlane_b32 s8, v14 -; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[6:7] +; GCN-NEXT: v_readfirstlane_b32 s7, v15 +; GCN-NEXT: v_readfirstlane_b32 s6, v14 +; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB12_13 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_ashr_i32 s0, s7, 31 -; GCN-NEXT: s_add_u32 s2, s6, s0 +; GCN-NEXT: s_ashr_i32 s0, s5, 31 +; GCN-NEXT: s_add_u32 s2, s4, s0 ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s7, s0 +; GCN-NEXT: s_addc_u32 s3, s5, s0 ; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v8, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v9, s13 @@ -4897,46 +4897,46 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GCN-NEXT: v_readfirstlane_b32 s2, v9 ; GCN-NEXT: v_readfirstlane_b32 s3, v8 -; GCN-NEXT: s_mul_i32 s7, s0, s2 +; GCN-NEXT: s_mul_i32 s5, s0, s2 ; GCN-NEXT: s_mul_hi_u32 s15, s0, s3 ; GCN-NEXT: s_mul_i32 s14, s1, s3 -; GCN-NEXT: s_add_i32 s7, s15, s7 -; GCN-NEXT: s_add_i32 s7, s7, s14 +; GCN-NEXT: s_add_i32 s5, s15, s5 +; GCN-NEXT: s_add_i32 s5, s5, s14 ; GCN-NEXT: s_mul_i32 s16, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s7 -; GCN-NEXT: s_mul_i32 s15, s3, s7 +; GCN-NEXT: s_mul_hi_u32 s14, s3, s5 +; GCN-NEXT: s_mul_i32 s15, s3, s5 ; GCN-NEXT: s_mul_hi_u32 s3, s3, s16 ; GCN-NEXT: s_add_u32 s3, s3, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 ; GCN-NEXT: s_mul_hi_u32 s17, s2, s16 ; GCN-NEXT: s_mul_i32 s16, s2, s16 ; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s15, s2, s7 +; GCN-NEXT: s_mul_hi_u32 s15, s2, s5 ; GCN-NEXT: s_addc_u32 s3, s14, s17 ; GCN-NEXT: s_addc_u32 s14, s15, 0 -; GCN-NEXT: s_mul_i32 s7, s2, s7 -; GCN-NEXT: s_add_u32 s3, s3, s7 -; GCN-NEXT: s_addc_u32 s7, 0, s14 +; GCN-NEXT: s_mul_i32 s5, s2, s5 +; GCN-NEXT: s_add_u32 s3, s3, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s14 ; GCN-NEXT: v_add_co_u32_e32 v8, vcc, s3, v8 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s7 -; GCN-NEXT: v_readfirstlane_b32 s7, v8 +; GCN-NEXT: s_addc_u32 s2, s2, s5 +; GCN-NEXT: v_readfirstlane_b32 s5, v8 ; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s14, s0, s7 +; GCN-NEXT: s_mul_hi_u32 s14, s0, s5 ; GCN-NEXT: s_add_i32 s3, s14, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s7 +; GCN-NEXT: s_mul_i32 s1, s1, s5 ; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s7 +; GCN-NEXT: s_mul_i32 s0, s0, s5 ; GCN-NEXT: s_mul_hi_u32 s14, s2, s0 ; GCN-NEXT: s_mul_i32 s15, s2, s0 -; GCN-NEXT: s_mul_i32 s17, s7, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s7, s0 -; GCN-NEXT: s_mul_hi_u32 s16, s7, s3 +; GCN-NEXT: s_mul_i32 s17, s5, s3 +; GCN-NEXT: s_mul_hi_u32 s0, s5, s0 +; GCN-NEXT: s_mul_hi_u32 s16, s5, s3 ; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s7, 0, s16 +; GCN-NEXT: s_addc_u32 s5, 0, s16 ; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s7, s14 +; GCN-NEXT: s_addc_u32 s0, s5, s14 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mul_i32 s3, s2, s3 ; GCN-NEXT: s_add_u32 s0, s0, s3 @@ -4944,23 +4944,23 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_co_u32_e32 v8, vcc, s0, v8 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s14, s9, 31 -; GCN-NEXT: s_add_u32 s0, s8, s14 +; GCN-NEXT: s_ashr_i32 s14, s7, 31 +; GCN-NEXT: s_add_u32 s0, s6, s14 ; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s9, s14 +; GCN-NEXT: s_addc_u32 s1, s7, s14 ; GCN-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] ; GCN-NEXT: v_readfirstlane_b32 s3, v8 ; GCN-NEXT: s_mul_i32 s1, s16, s2 -; GCN-NEXT: s_mul_hi_u32 s7, s16, s3 +; GCN-NEXT: s_mul_hi_u32 s5, s16, s3 ; GCN-NEXT: s_mul_hi_u32 s0, s16, s2 -; GCN-NEXT: s_add_u32 s1, s7, s1 +; GCN-NEXT: s_add_u32 s1, s5, s1 ; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s9, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s7, s17, s3 ; GCN-NEXT: s_mul_i32 s3, s17, s3 ; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s7, s17, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: s_mul_hi_u32 s5, s17, s2 +; GCN-NEXT: s_addc_u32 s0, s0, s7 +; GCN-NEXT: s_addc_u32 s1, s5, 0 ; GCN-NEXT: s_mul_i32 s2, s17, s2 ; GCN-NEXT: s_add_u32 s0, s0, s2 ; GCN-NEXT: s_addc_u32 s1, 0, s1 @@ -4969,15 +4969,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_add_i32 s1, s2, s1 ; GCN-NEXT: s_mul_i32 s2, s13, s0 ; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_i32 s7, s1, s2 +; GCN-NEXT: s_add_i32 s5, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: s_sub_i32 s1, s17, s7 +; GCN-NEXT: s_sub_i32 s1, s17, s5 ; GCN-NEXT: v_sub_co_u32_e32 v8, vcc, s16, v8 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s9, s1, s13 +; GCN-NEXT: s_subb_u32 s7, s1, s13 ; GCN-NEXT: v_subrev_co_u32_e64 v9, s[0:1], s12, v8 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s15, s9, 0 +; GCN-NEXT: s_subb_u32 s15, s7, 0 ; GCN-NEXT: s_cmp_ge_u32 s15, s13 ; GCN-NEXT: s_cselect_b32 s16, -1, 0 ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v9 @@ -4987,7 +4987,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s9, s13 +; GCN-NEXT: s_subb_u32 s2, s7, s13 ; GCN-NEXT: v_subrev_co_u32_e64 v11, s[0:1], s12, v9 ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: s_subb_u32 s2, s2, 0 @@ -4997,7 +4997,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mov_b32_e32 v11, s2 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s17, s7 +; GCN-NEXT: s_subb_u32 s0, s17, s5 ; GCN-NEXT: s_cmp_ge_u32 s0, s13 ; GCN-NEXT: s_cselect_b32 s1, -1, 0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v8 @@ -5017,8 +5017,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v10, vcc ; GCN-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-NEXT: .LBB12_2: -; GCN-NEXT: v_cvt_f32_u32_e32 v8, s6 -; GCN-NEXT: s_sub_i32 s0, 0, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v8, s4 +; GCN-NEXT: s_sub_i32 s0, 0, s4 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GCN-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 @@ -5027,14 +5027,14 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_mul_i32 s0, s0, s2 ; GCN-NEXT: s_mul_hi_u32 s0, s2, s0 ; GCN-NEXT: s_add_i32 s2, s2, s0 -; GCN-NEXT: s_mul_hi_u32 s0, s8, s2 -; GCN-NEXT: s_mul_i32 s0, s0, s6 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s6 -; GCN-NEXT: s_cmp_ge_u32 s0, s6 +; GCN-NEXT: s_mul_hi_u32 s0, s6, s2 +; GCN-NEXT: s_mul_i32 s0, s0, s4 +; GCN-NEXT: s_sub_i32 s0, s6, s0 +; GCN-NEXT: s_sub_i32 s2, s0, s4 +; GCN-NEXT: s_cmp_ge_u32 s0, s4 ; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s6 -; GCN-NEXT: s_cmp_ge_u32 s0, s6 +; GCN-NEXT: s_sub_i32 s2, s0, s4 +; GCN-NEXT: s_cmp_ge_u32 s0, s4 ; GCN-NEXT: s_cselect_b32 s0, s2, s0 ; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: v_mov_b32_e32 v8, s0 @@ -5470,8 +5470,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc ; GCN-NEXT: .LBB12_12: ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[4:5] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[4:5] +; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB12_13: ; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 @@ -5486,7 +5486,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v4i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -6088,7 +6088,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v4i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 @@ -8883,11 +8883,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 30, v9 @@ -8918,13 +8918,13 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v15, vcc ; GCN-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v12 ; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v16, vcc -; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 8498e9af46f2b5..cb8f82db92bbf8 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,14 +122,14 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_srem: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 @@ -153,47 +153,47 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 -; GCN-IR-NEXT: s_add_u32 s16, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s6, s18 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 +; GCN-IR-NEXT: s_add_u32 s16, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s7, -1 +; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] +; GCN-IR-NEXT: s_add_u32 s10, s4, s18 +; GCN-IR-NEXT: s_addc_u32 s11, s5, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 +; GCN-IR-NEXT: s_sub_u32 s4, s16, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s17, s13 +; GCN-IR-NEXT: s_ashr_i32 s14, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] +; GCN-IR-NEXT: s_and_b32 s4, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-IR-NEXT: s_mov_b32 s12, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s4, s9 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s5, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s7, s8 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s4, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 @@ -442,72 +442,72 @@ define i64 @v_test_srem(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s5, s4, s8 -; GCN-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-NEXT: s_or_b32 s5, s5, 1 +; GCN-NEXT: s_xor_b32 s1, s0, s8 +; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_or_b32 s1, s1, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_cselect_b32 s5, s5, 0 -; GCN-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-NEXT: s_add_i32 s5, s6, s5 -; GCN-NEXT: s_mul_i32 s5, s5, s8 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_bfe_i32 s4, s4, 0x170000 -; GCN-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: s_add_i32 s1, s2, s1 +; GCN-NEXT: s_mul_i32 s1, s1, s8 +; GCN-NEXT: s_sub_i32 s0, s0, s1 +; GCN-NEXT: s_bfe_i32 s0, s0, 0x170000 +; GCN-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 41 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s5, s4, s8 -; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: s_xor_b32 s1, s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 -; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-IR-NEXT: s_add_i32 s5, s6, s5 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s8 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x170000 -; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-IR-NEXT: s_add_i32 s1, s2, s1 +; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 +; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x170000 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 @@ -519,72 +519,72 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s5, s4, s8 -; GCN-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-NEXT: s_or_b32 s5, s5, 1 +; GCN-NEXT: s_xor_b32 s1, s0, s8 +; GCN-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-NEXT: s_or_b32 s1, s1, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_cselect_b32 s5, s5, 0 -; GCN-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-NEXT: s_add_i32 s5, s6, s5 -; GCN-NEXT: s_mul_i32 s5, s5, s8 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GCN-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: s_add_i32 s1, s2, s1 +; GCN-NEXT: s_mul_i32 s1, s1, s8 +; GCN-NEXT: s_sub_i32 s0, s0, s1 +; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GCN-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dword s5, s[4:5], 0xe +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s5, s4, s8 -; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 -; GCN-IR-NEXT: s_or_b32 s5, s5, 1 +; GCN-IR-NEXT: s_xor_b32 s1, s0, s8 +; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 +; GCN-IR-NEXT: s_or_b32 s1, s1, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 -; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 -; GCN-IR-NEXT: s_add_i32 s5, s6, s5 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s8 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-IR-NEXT: s_add_i32 s1, s2, s1 +; GCN-IR-NEXT: s_mul_i32 s1, s1, s8 +; GCN-IR-NEXT: s_sub_i32 s0, s0, s1 +; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 @@ -650,14 +650,14 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -691,14 +691,14 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -739,14 +739,14 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -780,14 +780,14 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s1, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 ; GCN-IR-NEXT: s_abs_i32 s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -829,7 +829,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -840,7 +840,7 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_abs_i32 s2, s3 @@ -868,7 +868,7 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -879,7 +879,7 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_abs_i32 s2, s3 @@ -915,38 +915,37 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 31 -; GCN-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-NEXT: s_add_u32 s8, s8, s0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s9, s9, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[8:9], s[0:1] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s0, 0, s12 -; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: s_ashr_i32 s6, s7, 31 +; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], 31 +; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], 31 +; GCN-NEXT: s_ashr_i32 s4, s5, 31 +; GCN-NEXT: s_add_u32 s6, s6, s4 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_addc_u32 s7, s7, s4 +; GCN-NEXT: s_xor_b64 s[8:9], s[6:7], s[4:5] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GCN-NEXT: s_sub_u32 s2, 0, s8 +; GCN-NEXT: s_subb_u32 s4, 0, s9 +; GCN-NEXT: s_ashr_i32 s12, s3, 31 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_mov_b32 s8, s4 -; GCN-NEXT: s_mov_b32 s9, s5 +; GCN-NEXT: s_mov_b32 s13, s12 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -965,11 +964,12 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -985,101 +985,101 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s0, s2, s6 +; GCN-NEXT: s_add_u32 s2, s10, s12 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s1, s3, s6 +; GCN-NEXT: s_addc_u32 s3, s11, s12 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7] -; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s14, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s15, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s15, v1 +; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s15, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s15, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s8, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s9, v0 +; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s15, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s14, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 +; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 +; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s15 +; GCN-NEXT: v_mov_b32_e32 v4, s11 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem33_64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 -; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 -; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s6, s10 -; GCN-IR-NEXT: s_subb_u32 s9, s7, s10 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 31 +; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-IR-NEXT: s_mov_b32 s5, s4 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s6, s2, s4 +; GCN-IR-NEXT: s_subb_u32 s7, s3, s4 +; GCN-IR-NEXT: s_ashr_i32 s2, s9, 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s8, s8, s2 +; GCN-IR-NEXT: s_subb_u32 s9, s9, s2 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[8:9], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[14:15] -; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[6:7] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 ; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17] ; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s7 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s6 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 @@ -1088,41 +1088,41 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s14 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s16 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s16 ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s6, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 +; GCN-IR-NEXT: s_not_b64 s[2:3], s[12:13] +; GCN-IR-NEXT: s_add_u32 s12, s2, s20 +; GCN-IR-NEXT: s_addc_u32 s13, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 +; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s6, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s6, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s6, 31 +; GCN-IR-NEXT: s_sub_u32 s2, s18, s14 +; GCN-IR-NEXT: s_subb_u32 s2, s19, s15 +; GCN-IR-NEXT: s_ashr_i32 s16, s2, 31 ; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s6, s16, 1 +; GCN-IR-NEXT: s_and_b32 s2, s16, 1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[16:17], s[2:3] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0 @@ -1132,16 +1132,16 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_readfirstlane_b32 s12, v0 ; GCN-IR-NEXT: s_add_i32 s11, s12, s11 ; GCN-IR-NEXT: s_add_i32 s11, s11, s9 -; GCN-IR-NEXT: s_sub_u32 s2, s2, s8 -; GCN-IR-NEXT: s_subb_u32 s3, s3, s11 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s8 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s11 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 +; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 31 %2 = ashr i64 %y, 31 @@ -1153,18 +1153,18 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s7, s7 -; GCN-NEXT: s_sext_i32_i16 s1, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 +; GCN-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_alignbit_b32 v2, s7, v2, 24 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_alignbit_b32 v2, s3, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v5, v2, v0 @@ -1176,53 +1176,53 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 +; GCN-IR-NEXT: s_sext_i32_i16 s1, s1 +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 +; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[4:5], 16 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[0:1], 16 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], 16 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[0:1], 16 ; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 16 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s4, s4, s0 -; GCN-IR-NEXT: s_subb_u32 s5, s5, s0 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 ; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[6:7] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[4:5] +; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 ; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17] ; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s5 -; GCN-IR-NEXT: s_cselect_b32 s10, 0, s4 +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] @@ -1233,10 +1233,10 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s14 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s16 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] @@ -1271,15 +1271,15 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GCN-IR-NEXT: s_mul_i32 s2, s6, s11 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s7, s10 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s6, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 -; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x9 +; GCN-IR-NEXT: s_mul_i32 s4, s6, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: s_mul_i32 s4, s7, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: s_mul_i32 s4, s6, s10 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s4 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0 @@ -1302,7 +1302,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_srem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s8, s3, 31 @@ -1984,7 +1984,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2016,7 +2016,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -2085,7 +2085,7 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 03d1dddd7b6061..29488579c15537 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -64,7 +64,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -83,7 +83,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -124,7 +124,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -146,7 +146,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -194,7 +194,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -212,7 +212,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -258,7 +258,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -284,7 +284,7 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -370,25 +370,25 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0x14 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x50 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -411,7 +411,7 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -428,7 +428,7 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index ebc82248cc80c1..5641c43c40084c 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -10,12 +10,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %i) { ; MUBUF-LABEL: kernel_background_evaluate: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_load_dword s0, s[2:3], 0x24 +; MUBUF-NEXT: s_load_dword s0, s[4:5], 0x24 ; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 -; MUBUF-NEXT: s_add_u32 s36, s36, s9 +; MUBUF-NEXT: s_add_u32 s36, s36, s11 ; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 @@ -48,12 +48,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR-LABEL: kernel_background_evaluate: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 s6, s6, s11 +; FLATSCR-NEXT: s_add_u32 s8, s8, s13 ; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 -; FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 +; FLATSCR-NEXT: s_addc_u32 s9, s9, 0 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; FLATSCR-NEXT: s_load_dword s2, s[4:5], 0x24 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; MUBUF11-LABEL: kernel_background_evaluate: ; MUBUF11: ; %bb.0: ; %entry -; MUBUF11-NEXT: s_load_b32 s2, s[2:3], 0x24 +; MUBUF11-NEXT: s_load_b32 s2, s[4:5], 0x24 ; MUBUF11-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000 @@ -108,7 +108,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR11-LABEL: kernel_background_evaluate: ; FLATSCR11: ; %bb.0: ; %entry -; FLATSCR11-NEXT: s_load_b32 s2, s[2:3], 0x24 +; FLATSCR11-NEXT: s_load_b32 s2, s[4:5], 0x24 ; FLATSCR11-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 6b0fbc44c65b7f..da99052ba69ba6 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -127,7 +127,7 @@ define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE32-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-OPT-NEXT: s_bitset0_b32 s15, 21 -; WAVE32-OPT-NEXT: s_add_u32 s12, s12, s9 +; WAVE32-OPT-NEXT: s_add_u32 s12, s12, s11 ; WAVE32-OPT-NEXT: s_addc_u32 s13, s13, 0 ; WAVE32-OPT-NEXT: s_lshr_b32 s0, s32, 5 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, s0 @@ -141,7 +141,7 @@ define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 0 ; WAVE64-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s12, s12, s9 +; WAVE64-OPT-NEXT: s_add_u32 s12, s12, s11 ; WAVE64-OPT-NEXT: s_addc_u32 s13, s13, 0 ; WAVE64-OPT-NEXT: s_lshr_b32 s0, s32, 6 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, s0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE32-O0-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-O0-NEXT: s_bitset0_b32 s15, 21 -; WAVE32-O0-NEXT: s_add_u32 s12, s12, s9 +; WAVE32-O0-NEXT: s_add_u32 s12, s12, s11 ; WAVE32-O0-NEXT: s_addc_u32 s13, s13, 0 ; WAVE32-O0-NEXT: s_mov_b32 s0, s32 ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 @@ -170,7 +170,7 @@ define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE64-O0-NEXT: s_mov_b32 s12, s0 ; WAVE64-O0-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-O0-NEXT: s_add_u32 s12, s12, s9 +; WAVE64-O0-NEXT: s_add_u32 s12, s12, s11 ; WAVE64-O0-NEXT: s_addc_u32 s13, s13, 0 ; WAVE64-O0-NEXT: s_mov_b32 s0, s32 ; WAVE64-O0-NEXT: s_lshr_b32 s0, s0, 6 @@ -186,7 +186,7 @@ define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s15, 21 -; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s12, s12, s9 +; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s12, s12, s11 ; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s13, s13, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s0, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 @@ -790,7 +790,7 @@ define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) { define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; WAVE32-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE32-OPT-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s0 @@ -799,7 +799,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; ; WAVE64-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE64-OPT-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s0 @@ -808,7 +808,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; ; WAVE32-O0-LABEL: kernel_stacksave_sgpr: ; WAVE32-O0: ; %bb.0: -; WAVE32-O0-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE32-O0-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-O0-NEXT: s_mov_b32 s1, s0 ; WAVE32-O0-NEXT: ;;#ASMSTART @@ -820,7 +820,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; ; WAVE64-O0-LABEL: kernel_stacksave_sgpr: ; WAVE64-O0: ; %bb.0: -; WAVE64-O0-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE64-O0-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-O0-NEXT: s_mov_b32 s1, s0 ; WAVE64-O0-NEXT: ;;#ASMSTART @@ -832,7 +832,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; ; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_sgpr: ; WAVE32-WWM-PREALLOC: ; %bb.0: -; WAVE32-WWM-PREALLOC-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE32-WWM-PREALLOC-NEXT: s_load_dword s0, s[4:5], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s1, s0 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMSTART @@ -855,23 +855,24 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE32-OPT-NEXT: s_movk_i32 s32, 0x1200 -; WAVE32-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] +; WAVE32-OPT-NEXT: s_mov_b32 s13, s9 +; WAVE32-OPT-NEXT: s_mov_b32 s12, s8 +; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[4:5] ; WAVE32-OPT-NEXT: s_mov_b32 s4, s32 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 ; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE32-OPT-NEXT: s_mov_b32 s14, s8 +; WAVE32-OPT-NEXT: s_mov_b32 s14, s10 ; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi ; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE32-OPT-NEXT: s_mov_b32 s12, s6 -; WAVE32-OPT-NEXT: s_mov_b32 s13, s7 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-OPT-NEXT: s_bitset0_b32 s23, 21 -; WAVE32-OPT-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-OPT-NEXT: s_add_u32 s20, s20, s11 ; WAVE32-OPT-NEXT: s_addc_u32 s21, s21, 0 +; WAVE32-OPT-NEXT: s_mov_b64 s[10:11], s[6:7] ; WAVE32-OPT-NEXT: s_lshr_b32 s15, s4, 5 ; WAVE32-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] -; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] +; WAVE32-OPT-NEXT: s_mov_b64 s[6:7], s[2:3] ; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE32-OPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 @@ -891,22 +892,23 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE64-OPT-NEXT: s_movk_i32 s32, 0x2400 -; WAVE64-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] +; WAVE64-OPT-NEXT: s_mov_b32 s13, s9 +; WAVE64-OPT-NEXT: s_mov_b32 s12, s8 +; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[4:5] ; WAVE64-OPT-NEXT: s_mov_b32 s4, s32 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 ; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE64-OPT-NEXT: s_mov_b32 s14, s8 +; WAVE64-OPT-NEXT: s_mov_b32 s14, s10 ; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi ; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE64-OPT-NEXT: s_mov_b32 s12, s6 -; WAVE64-OPT-NEXT: s_mov_b32 s13, s7 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s20, s20, s9 +; WAVE64-OPT-NEXT: s_add_u32 s20, s20, s11 ; WAVE64-OPT-NEXT: s_addc_u32 s21, s21, 0 +; WAVE64-OPT-NEXT: s_mov_b64 s[10:11], s[6:7] ; WAVE64-OPT-NEXT: s_lshr_b32 s15, s4, 6 ; WAVE64-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] -; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] +; WAVE64-OPT-NEXT: s_mov_b64 s[6:7], s[2:3] ; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE64-OPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 @@ -926,13 +928,14 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-O0-NEXT: s_bitset0_b32 s23, 21 -; WAVE32-O0-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-O0-NEXT: s_add_u32 s20, s20, s11 ; WAVE32-O0-NEXT: s_addc_u32 s21, s21, 0 -; WAVE32-O0-NEXT: s_mov_b32 s14, s8 -; WAVE32-O0-NEXT: s_mov_b32 s13, s7 -; WAVE32-O0-NEXT: s_mov_b32 s12, s6 -; WAVE32-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; WAVE32-O0-NEXT: s_mov_b64 s[8:9], s[2:3] +; WAVE32-O0-NEXT: s_mov_b32 s14, s10 +; WAVE32-O0-NEXT: s_mov_b32 s13, s9 +; WAVE32-O0-NEXT: s_mov_b32 s12, s8 +; WAVE32-O0-NEXT: s_mov_b64 s[10:11], s[6:7] +; WAVE32-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; WAVE32-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; WAVE32-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE32-O0-NEXT: s_mov_b32 s0, s32 ; WAVE32-O0-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane @@ -944,19 +947,18 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23] -; WAVE32-O0-NEXT: s_mov_b32 s6, s32 +; WAVE32-O0-NEXT: s_mov_b32 s15, s32 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 -; WAVE32-O0-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s15 offset:4 +; WAVE32-O0-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi ; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 -; WAVE32-O0-NEXT: s_mov_b32 s17, s6 -; WAVE32-O0-NEXT: s_mov_b32 s6, 20 -; WAVE32-O0-NEXT: v_lshlrev_b32_e64 v2, s6, v2 -; WAVE32-O0-NEXT: s_mov_b32 s6, 10 -; WAVE32-O0-NEXT: v_lshlrev_b32_e64 v1, s6, v1 +; WAVE32-O0-NEXT: s_mov_b32 s17, s15 +; WAVE32-O0-NEXT: s_mov_b32 s15, 20 +; WAVE32-O0-NEXT: v_lshlrev_b32_e64 v2, s15, v2 +; WAVE32-O0-NEXT: s_mov_b32 s15, 10 +; WAVE32-O0-NEXT: v_lshlrev_b32_e64 v1, s15, v1 ; WAVE32-O0-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE32-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr15 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, s18 @@ -1036,13 +1038,14 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_mov_b32 s24, s0 ; WAVE64-O0-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 ; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-O0-NEXT: s_add_u32 s24, s24, s9 +; WAVE64-O0-NEXT: s_add_u32 s24, s24, s11 ; WAVE64-O0-NEXT: s_addc_u32 s25, s25, 0 -; WAVE64-O0-NEXT: s_mov_b32 s14, s8 -; WAVE64-O0-NEXT: s_mov_b32 s13, s7 -; WAVE64-O0-NEXT: s_mov_b32 s12, s6 -; WAVE64-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; WAVE64-O0-NEXT: s_mov_b64 s[8:9], s[2:3] +; WAVE64-O0-NEXT: s_mov_b32 s14, s10 +; WAVE64-O0-NEXT: s_mov_b32 s13, s9 +; WAVE64-O0-NEXT: s_mov_b32 s12, s8 +; WAVE64-O0-NEXT: s_mov_b64 s[10:11], s[6:7] +; WAVE64-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; WAVE64-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; WAVE64-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE64-O0-NEXT: s_mov_b32 s0, s32 ; WAVE64-O0-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane @@ -1054,19 +1057,18 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; WAVE64-O0-NEXT: s_mov_b64 s[2:3], s[26:27] -; WAVE64-O0-NEXT: s_mov_b32 s6, s32 +; WAVE64-O0-NEXT: s_mov_b32 s15, s32 ; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 17 -; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 -; WAVE64-O0-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi +; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], s15 offset:4 +; WAVE64-O0-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi ; WAVE64-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE64-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 -; WAVE64-O0-NEXT: s_mov_b32 s17, s6 -; WAVE64-O0-NEXT: s_mov_b32 s6, 20 -; WAVE64-O0-NEXT: v_lshlrev_b32_e64 v2, s6, v2 -; WAVE64-O0-NEXT: s_mov_b32 s6, 10 -; WAVE64-O0-NEXT: v_lshlrev_b32_e64 v1, s6, v1 +; WAVE64-O0-NEXT: s_mov_b32 s17, s15 +; WAVE64-O0-NEXT: s_mov_b32 s15, 20 +; WAVE64-O0-NEXT: v_lshlrev_b32_e64 v2, s15, v2 +; WAVE64-O0-NEXT: s_mov_b32 s15, 10 +; WAVE64-O0-NEXT: v_lshlrev_b32_e64 v1, s15, v1 ; WAVE64-O0-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE64-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr15 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v0, s18 @@ -1147,13 +1149,14 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s23, 21 -; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s11 ; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s21, s21, 0 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s7 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s12, s6 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[10:11], s[4:5] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[8:9], s[2:3] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s10 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s9 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s12, s8 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[10:11], s[6:7] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[8:9], s[4:5] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[6:7], s[2:3] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[4:5], s[0:1] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s0, s32 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane @@ -1165,19 +1168,18 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, s32 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, s32 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s15 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s17, s6 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, 20 -; WAVE32-WWM-PREALLOC-NEXT: v_lshlrev_b32_e64 v2, s6, v2 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, 10 -; WAVE32-WWM-PREALLOC-NEXT: v_lshlrev_b32_e64 v1, s6, v1 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s17, s15 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, 20 +; WAVE32-WWM-PREALLOC-NEXT: v_lshlrev_b32_e64 v2, s15, v2 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, 10 +; WAVE32-WWM-PREALLOC-NEXT: v_lshlrev_b32_e64 v1, s15, v1 ; WAVE32-WWM-PREALLOC-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr6_sgpr7 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr15 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, s18 diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index f7eb760fda084f..76ed4f6238dbed 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -8,24 +8,24 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 @@ -35,39 +35,39 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; ; GFX6-LABEL: store_lds_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: v_mov_b32_e32 v4, s6 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] @@ -79,43 +79,43 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s0, s7, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s4, s3, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s7, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s6, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s6, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -123,11 +123,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:12 @@ -176,11 +176,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:12 @@ -230,26 +230,25 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: s_lshr_b32 s3, s6, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: s_lshr_b32 s0, s7, 8 -; GFX10-NEXT: s_lshr_b32 s2, s6, 8 -; GFX10-NEXT: s_lshr_b32 s6, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: s_lshr_b32 s1, s7, 24 -; GFX10-NEXT: s_lshr_b32 s5, s5, 24 -; GFX10-NEXT: v_mov_b32_e32 v8, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v7, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: s_lshr_b32 s0, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:8 @@ -261,10 +260,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_lshr_b32 s1, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_lshr_b32 s1, s0, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 @@ -275,10 +275,10 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_lshr_b32 s4, s3, 8 ; GFX11-NEXT: s_lshr_b32 s3, s3, 24 ; GFX11-NEXT: s_lshr_b32 s5, s2, 8 @@ -317,31 +317,31 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:12 @@ -366,11 +366,11 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:12 @@ -396,14 +396,14 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 @@ -417,10 +417,10 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_mov_b32_e32 v4, s2 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:14 @@ -439,25 +439,25 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 @@ -468,11 +468,11 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3 @@ -484,14 +484,14 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3 ; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 ; GFX10-NEXT: s_endpgm @@ -499,10 +499,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_mov_b32_e32 v4, s3 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 @@ -515,24 +515,24 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -542,39 +542,39 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: v_mov_b32_e32 v4, s6 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s1 @@ -587,24 +587,24 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 @@ -614,39 +614,39 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; ; GFX6-LABEL: store_lds_v4i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: v_mov_b32_e32 v4, s6 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 64ce67a1a3deeb..70906d8474aa50 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -8,36 +8,39 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 @@ -47,25 +50,26 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: ds_store_b96 v3, v[0:2] ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out @@ -75,34 +79,34 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s0, s6, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s6, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 @@ -110,11 +114,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 @@ -152,11 +156,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 @@ -195,25 +199,25 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_lshr_b32 s0, s6, 8 -; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 -; GFX10-NEXT: s_lshr_b32 s3, s5, 24 -; GFX10-NEXT: s_lshr_b32 s5, s4, 8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: s_lshr_b32 s3, s2, 8 +; GFX10-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshr_b32 s5, s0, 8 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 +; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 @@ -231,10 +235,10 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: s_lshr_b32 s3, s2, 8 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24 @@ -265,28 +269,28 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 @@ -306,11 +310,11 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 @@ -331,13 +335,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 @@ -349,10 +353,10 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 ; GFX11-NEXT: ds_store_b16 v0, v2 @@ -368,24 +372,24 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: ds_write_b32 v0, v3 offset:8 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 @@ -395,11 +399,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 offset:8 @@ -410,13 +414,13 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:8 ; GFX10-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 ; GFX10-NEXT: s_endpgm @@ -424,10 +428,10 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 ; GFX11-NEXT: ds_store_b32 v0, v3 offset:8 @@ -439,24 +443,24 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b32 v2, v1 offset:8 @@ -466,11 +470,11 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 @@ -481,13 +485,13 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm @@ -495,10 +499,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: ds_store_b32 v2, v3 offset:8 ; GFX11-NEXT: ds_store_b64 v2, v[0:1] @@ -510,36 +514,39 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write_b96 v3, v[0:2] ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 @@ -549,25 +556,26 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: ds_store_b96 v3, v[0:2] ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 3644bef9c20a1f..f791135d45e9aa 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,12 +50,12 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s6, 14 +; HAWAII-NEXT: s_or_b32 s0, s8, 14 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s7 +; HAWAII-NEXT: v_mov_b32_e32 v1, s9 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v1, s2 @@ -70,12 +70,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s6, 14 +; FIJI-NEXT: s_or_b32 s0, s8, 14 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s7 +; FIJI-NEXT: v_mov_b32_e32 v1, s9 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s3, s1, 0xffff @@ -94,9 +94,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[8:9] offset:14 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -114,9 +114,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[8:9] offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s3, s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, s2 @@ -132,15 +132,15 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; GFX11-LABEL: local_store_i55: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[2:3] offset:14 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s3, s1, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_mov_b32_e32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[4:5] offset:14 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s2 @@ -169,8 +169,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s2 @@ -182,8 +182,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -195,8 +195,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX10-LABEL: local_store_i48: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -208,10 +208,10 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 @@ -223,9 +223,9 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x4 -; HAWAII-NEXT: s_load_dword s3, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x4 +; HAWAII-NEXT: s_load_dword s3, s[8:9], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_and_b32 s2, s2, 1 @@ -239,9 +239,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x10 -; FIJI-NEXT: s_load_dword s3, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[8:9], 0x10 +; FIJI-NEXT: s_load_dword s3, s[8:9], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s2, s2, 1 @@ -255,9 +255,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x10 +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -271,9 +271,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX10-LABEL: local_store_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x10 +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 @@ -287,13 +287,13 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX11-LABEL: local_store_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x10 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 1 +; GFX11-NEXT: s_and_b32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: ds_store_b8 v2, v3 offset:8 ; GFX11-NEXT: ds_store_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 85dd5581f287b2..80ccd1ffe02948 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX8-LABEL: s_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -33,17 +33,17 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s0, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -58,22 +58,22 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: s_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sub_i32 s4, 0x4d2, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x4d2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s4 +; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -82,18 +82,18 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX9-LABEL: s_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, 0x4d2, s4 +; GFX9-NEXT: s_sub_i32 s2, 0x4d2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -108,7 +108,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -126,7 +126,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -140,18 +140,18 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -170,7 +170,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -188,7 +188,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: test_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -202,18 +202,18 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: test_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, 0x7b, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -230,7 +230,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -249,7 +249,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -264,19 +264,19 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -296,7 +296,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -318,7 +318,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -340,22 +340,22 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2 ; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 @@ -379,7 +379,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -400,7 +400,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -420,21 +420,21 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_sub_u16_e32 v1, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -460,7 +460,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -485,7 +485,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -503,19 +503,19 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 -; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -539,7 +539,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -571,7 +571,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -592,20 +592,20 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: test_sub_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -630,55 +630,55 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sub_u32 s4, s4, s6 -; GFX6-NEXT: s_subb_u32 s5, s5, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_sub_u32 s0, s0, s2 +; GFX6-NEXT: s_subb_u32 s1, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_u32 s2, s4, s6 -; GFX8-NEXT: s_subb_u32 s3, s5, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_sub_u32 s0, s0, s2 +; GFX8-NEXT: s_subb_u32 s1, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s4, s6 -; GFX9-NEXT: s_subb_u32 s3, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_sub_u32 s0, s0, s2 +; GFX9-NEXT: s_subb_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] +; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm %result = sub i64 %a, %b store i64 %result, ptr addrspace(1) %out, align 8 @@ -688,21 +688,21 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind { ; GFX6-LABEL: v_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] +; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s0 +; GFX6-NEXT: s_mov_b32 s9, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc @@ -711,58 +711,58 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: v_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: v_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] -; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1] +; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX12-NEXT: global_load_b64 v[2:3], v2, s[4:5] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid @@ -777,21 +777,21 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 ; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s0 +; GFX6-NEXT: s_mov_b32 s9, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc @@ -802,15 +802,15 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -819,47 +819,47 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_sub_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_sub_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] -; GFX12-NEXT: global_load_b128 v[4:7], v4, s[0:1] +; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX12-NEXT: global_load_b128 v[4:7], v4, s[4:5] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid @@ -874,23 +874,23 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v12, 5, v0 ; GFX6-NEXT: v_mov_b32_e32 v13, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15] ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[12:15], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[0:3], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[0:3], 0 addr64 offset:16 +; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:16 ; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[12:13], s[12:15], 0 addr64 offset:16 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s0 +; GFX6-NEXT: s_mov_b32 s9, s1 ; GFX6-NEXT: s_waitcnt vmcnt(2) ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc @@ -907,15 +907,15 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[8:9] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13] @@ -925,10 +925,10 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-NEXT: v_mov_b32_e32 v17, s5 -; GFX8-NEXT: v_mov_b32_e32 v16, s4 -; GFX8-NEXT: s_add_u32 s0, s4, 16 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc @@ -947,14 +947,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] -; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -966,25 +966,25 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v11, v15, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v12 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v13, vcc -; GFX9-NEXT: global_store_dwordx4 v16, v[4:7], s[4:5] offset:16 -; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: v_test_sub_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] -; GFX12-NEXT: global_load_b128 v[4:7], v12, s[0:1] -; GFX12-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:16 -; GFX12-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:16 +; GFX12-NEXT: global_load_b128 v[0:3], v12, s[2:3] +; GFX12-NEXT: global_load_b128 v[4:7], v12, s[4:5] +; GFX12-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:16 +; GFX12-NEXT: global_load_b128 v[12:15], v12, s[4:5] offset:16 ; GFX12-NEXT: s_wait_loadcnt 0x2 ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo @@ -996,8 +996,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 18a94f75b6d948..c9547e2c68c82c 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,79 +8,79 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v1, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_sub_u16_e32 v2, v0, v1 ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid @@ -96,72 +96,72 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_pk_sub_i16 v0, s9, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s6, s[6:7], 0x0 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-NEXT: s_load_dword s3, s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_lshr_b32 s5, s7, 16 -; VI-NEXT: s_sub_i32 s6, s6, s7 -; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s6, 0xffff -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: s_sub_i32 s2, s2, s3 +; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_sub_i16 v0, s4, s5 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: v_pk_sub_i16 v0, s2, s4 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 %b = load <2 x i16>, ptr addrspace(4) %in1 @@ -173,7 +173,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; GCN-LABEL: s_test_sub_self_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -183,7 +183,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_sub_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 @@ -193,7 +193,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_sub_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -210,20 +210,20 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -242,19 +242,19 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s6, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -271,21 +271,21 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b +; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -304,21 +304,21 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_sub_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -343,21 +343,21 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3 +; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -376,21 +376,21 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_sub_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -414,20 +414,20 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -446,21 +446,21 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -484,20 +484,20 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -515,21 +515,21 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -554,20 +554,20 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -585,21 +585,21 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -624,85 +624,85 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_sub_u16_e32 v0, v1, v2 ; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid @@ -720,92 +720,92 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: v_sub_u16_e32 v0, v4, v2 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid @@ -823,87 +823,87 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v1, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid @@ -921,14 +921,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -936,25 +936,25 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -962,22 +962,22 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -985,23 +985,23 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll index 42a1f746be3fad..c0587d260c6f23 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.convergencetokens.ll @@ -10,16 +10,16 @@ declare void @void_func_i64_inreg(i64 inreg) define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 { ; CHECK-LABEL: name: tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr6_sgpr7, $vgpr31 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr31 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[CONVERGENCECTRL_ENTRY:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -35,13 +35,13 @@ define void @tail_call_i64_inreg_uniform_in_vgpr_convergence_tokens() #0 { ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @void_func_i64_inreg, target-flags(amdgpu-gotprel32-hi) @void_func_i64_inreg, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]] - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY1]] - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY7]] - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY6]] - ; CHECK-NEXT: $sgpr12 = COPY [[COPY5]] - ; CHECK-NEXT: $sgpr13 = COPY [[COPY4]] - ; CHECK-NEXT: $sgpr14 = COPY [[COPY3]] - ; CHECK-NEXT: $sgpr15 = COPY [[COPY2]] + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY7]] + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY6]] + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY5]] + ; CHECK-NEXT: $sgpr12 = COPY [[COPY4]] + ; CHECK-NEXT: $sgpr13 = COPY [[COPY3]] + ; CHECK-NEXT: $sgpr14 = COPY [[COPY2]] + ; CHECK-NEXT: $sgpr15 = COPY [[COPY1]] ; CHECK-NEXT: $vgpr31 = COPY [[COPY]] ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll index 9b992c40c28eeb..0689c0585d8a69 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll @@ -6,13 +6,12 @@ declare hidden void @void_func_i32_inreg(i32 inreg) define void @tail_call_i32_inreg_uniform(i32 inreg %sgpr) { ; CHECK-LABEL: tail_call_i32_inreg_uniform: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s0, s6 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12 -; CHECK-NEXT: s_setpc_b64 s[16:17] +; CHECK-NEXT: s_mov_b32 s0, s16 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_setpc_b64 s[18:19] tail call void @void_func_i32_inreg(i32 inreg %sgpr) ret void } @@ -23,14 +22,13 @@ define void @indirect_tail_call_i32_inreg_uniform(i32 inreg %sgpr) { ; CHECK-LABEL: indirect_tail_call_i32_inreg_uniform: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; CHECK-NEXT: ; kill: def $sgpr6_sgpr7 killed $sgpr6_sgpr7 killed $sgpr7 -; CHECK-NEXT: s_mov_b32 s0, s6 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, constant@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, constant@rel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 +; CHECK-NEXT: s_mov_b32 s0, s16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[16:17] +; CHECK-NEXT: s_setpc_b64 s[18:19] %fptr = load ptr, ptr addrspace(4) @constant, align 8 tail call void %fptr(i32 inreg %sgpr) ret void @@ -42,14 +40,14 @@ define void @tail_call_i64_inreg_uniform(i64 inreg %sgpr) { ; CHECK-LABEL: tail_call_i64_inreg_uniform: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, void_func_i64_inreg@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i64_inreg@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; CHECK-NEXT: s_mov_b32 s1, s7 -; CHECK-NEXT: s_mov_b32 s0, s6 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, void_func_i64_inreg@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i64_inreg@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 +; CHECK-NEXT: s_mov_b32 s1, s17 +; CHECK-NEXT: s_mov_b32 s0, s16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[16:17] +; CHECK-NEXT: s_setpc_b64 s[18:19] tail call void @void_func_i64_inreg(i64 inreg %sgpr) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll index 3ca2adec90be59..ac449f972acb50 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-uniform-target-in-vgprs-issue110930.convergencetokens.ll @@ -11,16 +11,16 @@ target triple = "amdgcn-amd-amdhsa" define void @tail_call_uniform_vgpr_value_convergence_tokens() #0 { ; CHECK-LABEL: name: tail_call_uniform_vgpr_value_convergence_tokens ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr6_sgpr7, $vgpr31 + ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr31 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[CONVERGENCECTRL_ENTRY:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -35,13 +35,13 @@ define void @tail_call_uniform_vgpr_value_convergence_tokens() #0 { ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec, implicit [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY8]] - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY1]] - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY7]] - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY6]] - ; CHECK-NEXT: $sgpr12 = COPY [[COPY5]] - ; CHECK-NEXT: $sgpr13 = COPY [[COPY4]] - ; CHECK-NEXT: $sgpr14 = COPY [[COPY3]] - ; CHECK-NEXT: $sgpr15 = COPY [[COPY2]] + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY7]] + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY6]] + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY5]] + ; CHECK-NEXT: $sgpr12 = COPY [[COPY4]] + ; CHECK-NEXT: $sgpr13 = COPY [[COPY3]] + ; CHECK-NEXT: $sgpr14 = COPY [[COPY2]] + ; CHECK-NEXT: $sgpr15 = COPY [[COPY1]] ; CHECK-NEXT: $vgpr31 = COPY [[COPY]] ; CHECK-NEXT: CONVERGENCECTRL_GLUE [[CONVERGENCECTRL_ENTRY]] ; CHECK-NEXT: SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit [[CONVERGENCECTRL_ENTRY]] diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index 80b0bdd8c03759..aa3e05fdbdb36a 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -85,34 +85,34 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) { define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture %arg2) local_unnamed_addr { ; SI-LABEL: truncate_high_elt_extract_vector: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s4, s[4:5], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0x0 +; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s4 -; SI-NEXT: s_sext_i32_i16 s5, s5 -; SI-NEXT: s_mul_i32 s5, s5, s4 -; SI-NEXT: s_lshr_b32 s4, s5, 16 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_sext_i32_i16 s0, s0 +; SI-NEXT: s_sext_i32_i16 s1, s1 +; SI-NEXT: s_mul_i32 s1, s1, s0 +; SI-NEXT: s_lshr_b32 s0, s1, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: truncate_high_elt_extract_vector: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[4:5], 0x0 -; VI-NEXT: s_load_dword s3, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s2 -; VI-NEXT: s_sext_i32_i16 s1, s3 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_sext_i32_i16 s1, s1 ; VI-NEXT: s_mul_i32 s1, s1, s0 ; VI-NEXT: s_lshr_b32 s0, s1, 16 ; VI-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index 088aff983ddc9c..5f01db82ccd48d 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -5,98 +5,98 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, <16 x i32> %in) { ; SI-LABEL: truncstore_arg_v16i32_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s1, s18, 0xff -; SI-NEXT: s_lshl_b32 s0, s19, 24 -; SI-NEXT: s_lshl_b32 s1, s1, 16 -; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_lshl_b32 s1, s17, 8 -; SI-NEXT: s_and_b32 s2, s16, 0xff -; SI-NEXT: s_or_b32 s1, s2, s1 -; SI-NEXT: s_and_b32 s1, s1, 0xffff -; SI-NEXT: s_and_b32 s2, s14, 0xff -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: s_lshl_b32 s1, s15, 24 -; SI-NEXT: s_lshl_b32 s2, s2, 16 -; SI-NEXT: s_or_b32 s1, s1, s2 -; SI-NEXT: s_lshl_b32 s2, s13, 8 -; SI-NEXT: s_and_b32 s3, s12, 0xff -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_and_b32 s2, s2, 0xffff -; SI-NEXT: s_and_b32 s3, s10, 0xff -; SI-NEXT: s_or_b32 s1, s2, s1 -; SI-NEXT: s_lshl_b32 s2, s11, 24 -; SI-NEXT: s_lshl_b32 s3, s3, 16 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: s_lshl_b32 s3, s9, 8 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_or_b32 s3, s8, s3 -; SI-NEXT: s_and_b32 s3, s3, 0xffff -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_lshl_b32 s3, s7, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s23, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_or_b32 s3, s3, s6 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s3, s4, s3 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_mov_b32_e32 v3, s0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s6, s20, 0xff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshl_b32 s6, s17, 8 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_and_b32 s7, s14, 0xff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshl_b32 s6, s15, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_lshl_b32 s7, s13, 8 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_or_b32 s7, s12, s7 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_lshl_b32 s7, s11, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: truncstore_arg_v16i32_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s19, 8 -; VI-NEXT: s_and_b32 s3, s18, 0xff -; VI-NEXT: s_lshl_b32 s17, s17, 8 -; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s2, s23, 8 +; VI-NEXT: s_and_b32 s3, s22, 0xff +; VI-NEXT: s_lshl_b32 s4, s21, 8 +; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_or_b32 s2, s3, s2 -; VI-NEXT: s_or_b32 s3, s16, s17 +; VI-NEXT: s_or_b32 s3, s5, s4 ; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_or_b32 s2, s3, s2 -; VI-NEXT: s_lshl_b32 s3, s15, 8 -; VI-NEXT: s_and_b32 s14, s14, 0xff -; VI-NEXT: s_lshl_b32 s13, s13, 8 -; VI-NEXT: s_and_b32 s12, s12, 0xff -; VI-NEXT: s_lshl_b32 s11, s11, 8 -; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_lshl_b32 s9, s9, 8 -; VI-NEXT: s_and_b32 s8, s8, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s5, s5, 8 -; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_or_b32 s3, s14, s3 -; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s3, s19, 8 +; VI-NEXT: s_and_b32 s4, s18, 0xff +; VI-NEXT: s_or_b32 s3, s4, s3 +; VI-NEXT: s_lshl_b32 s4, s17, 8 +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s12, s12, 0xffff -; VI-NEXT: s_lshl_b32 s10, s10, 16 -; VI-NEXT: s_and_b32 s8, s8, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s3, s12, s3 -; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s3, s4, s3 +; VI-NEXT: s_lshl_b32 s4, s15, 8 +; VI-NEXT: s_and_b32 s5, s14, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_lshl_b32 s5, s13, 8 +; VI-NEXT: s_and_b32 s6, s12, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_lshl_b32 s5, s11, 8 +; VI-NEXT: s_and_b32 s6, s10, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_lshl_b32 s6, s9, 8 +; VI-NEXT: s_and_b32 s7, s8, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -110,9 +110,9 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, <16 x i64> %in) { ; SI-LABEL: truncstore_arg_v16i64_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x39 -; SI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x29 +; SI-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x39 +; SI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x29 ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s38, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -161,9 +161,9 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, ; ; VI-LABEL: truncstore_arg_v16i64_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0xe4 -; VI-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0xe4 +; VI-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s1, s30, 8 ; VI-NEXT: s_and_b32 s3, s28, 0xff diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 88bdf6454fe522..db802732e987b5 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: -; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[2:3], +; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[4:5], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] ; SI: buffer_store_dword [[VLOAD]] ; VI: flat_store_dword v[{{[0-9:]+}}], [[VLOAD]] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 837b46f0ce578d..12eec4fa3bd594 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -31,26 +31,25 @@ bb: define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) { ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb -; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[72:75], s[6:7], 0x0 -; GLOBALNESS1-NEXT: s_nop 0 -; GLOBALNESS1-NEXT: s_load_dword s6, s[6:7], 0x14 +; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] +; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[72:73] -; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[4:5] -; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 -; GLOBALNESS1-NEXT: s_load_dword s7, s[38:39], 0x20 -; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s15 +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] +; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 +; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s74, 0 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 @@ -60,24 +59,25 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[40:41], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 -; GLOBALNESS1-NEXT: s_mov_b32 s68, s14 -; GLOBALNESS1-NEXT: s_mov_b32 s69, s13 -; GLOBALNESS1-NEXT: s_mov_b32 s70, s12 -; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[8:9] +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 +; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 +; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) @@ -89,14 +89,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -121,26 +121,27 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s75, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s75, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 @@ -150,7 +151,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s75, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 @@ -162,15 +163,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[60:61], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[72:73], s[60:61] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[74:75], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -184,68 +185,70 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[62:63], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[40:41] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[64:65] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s66, s38, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s67, s39, 0 +; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[66:67] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[66:67] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -261,12 +264,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[72:73] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[60:61] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[74:75] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -287,16 +290,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.31: ; %bb7.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: s_getpc_b64 s[6:7] -; GLOBALNESS1-NEXT: s_add_u32 s6, s6, widget@rel32@lo+4 -; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, widget@rel32@hi+12 -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: .LBB1_32: ; %Flow ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -304,36 +308,36 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.33: ; %bb11.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: s_getpc_b64 s[6:7] -; GLOBALNESS1-NEXT: s_add_u32 s6, s6, widget@rel32@lo+4 -; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, widget@rel32@hi+12 -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock ; ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb -; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[6:7], 0x0 -; GLOBALNESS0-NEXT: s_nop 0 -; GLOBALNESS0-NEXT: s_load_dword s6, s[6:7], 0x14 +; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] +; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5] -; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[38:39], 0x18 -; GLOBALNESS0-NEXT: s_load_dword s7, s[38:39], 0x20 -; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s15 +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 +; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 @@ -347,24 +351,25 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[40:41], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v3 -; GLOBALNESS0-NEXT: s_mov_b32 s66, s14 -; GLOBALNESS0-NEXT: s_mov_b32 s67, s13 -; GLOBALNESS0-NEXT: s_mov_b32 s68, s12 -; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[8:9] +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 +; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 +; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) @@ -376,14 +381,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -408,16 +413,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s68 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s67 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s66 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 @@ -449,15 +455,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[60:61], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[72:73], s[60:61] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -471,68 +477,70 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[62:63], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[40:41] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[64:65] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s70, s38, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s71, s39, 0 +; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[70:71] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s68 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s67 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s66 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[70:71] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s68 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s67 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s66 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -548,12 +556,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[72:73] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[60:61] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[76:77] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -574,16 +582,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s68 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s67 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s66 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] -; GLOBALNESS0-NEXT: s_add_u32 s6, s6, widget@rel32@lo+4 -; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, widget@rel32@hi+12 -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: .LBB1_32: ; %Flow ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -591,16 +600,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s68 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s67 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s66 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] -; GLOBALNESS0-NEXT: s_add_u32 s6, s6, widget@rel32@lo+4 -; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, widget@rel32@hi+12 -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock bb: store i32 0, ptr addrspace(1) null, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 1fd5f7f8f9bb3b..6606b1d050421c 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -6,60 +6,60 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_add_u32 s4, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_addc_u32 s5, s7, s9 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_add_u32 s0, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_addc_u32 s1, s3, s9 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_add_u32 s0, s6, s0 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_addc_u32 s1, s7, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s3, s5 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_uaddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_add_u32 s4, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_addc_u32 s5, s3, s7 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 @@ -75,35 +75,35 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: flat_store_byte v[2:3], v5 @@ -111,15 +111,15 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,16 +182,16 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,19 +268,19 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -345,19 +345,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s8, s10 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_addc_u32 s1, s9, s11 +; GFX9-NEXT: s_add_u32 s0, s12, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_addc_u32 s1, s13, s15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX9-NEXT: global_store_byte v4, v0, s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,18 +424,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v4, v0, s[6:7] +; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -510,17 +510,17 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] -; GFX9-NEXT: global_load_ushort v2, v0, s[10:11] +; GFX9-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX9-NEXT: global_load_ushort v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v1, v2 ; GFX9-NEXT: v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: global_store_short v0, v2, s[4:5] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v2, s[8:9] +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -539,7 +539,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_uaddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -570,7 +570,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_uaddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -593,18 +593,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_uaddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 @@ -620,7 +620,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -630,7 +630,7 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] @@ -647,7 +647,7 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: s_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: s_cmp_eq_u32 s0, s1 @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -670,7 +670,7 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_cmp_eq_u32 s0, s1 @@ -680,12 +680,12 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] -; GFX9-NEXT: global_store_byte v1, v2, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[8:9] +; GFX9-NEXT: global_store_byte v1, v2, s[10:11] ; GFX9-NEXT: s_endpgm entry: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) @@ -708,7 +708,7 @@ exit: define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -769,12 +769,12 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v2, v3 @@ -782,9 +782,9 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 ; GFX9-NEXT: .LBB9_2: ; %exit -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 735956caa72da4..7c310477dd838f 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -44,7 +44,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -80,7 +80,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -112,7 +112,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX1030-LABEL: udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -185,7 +185,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 @@ -346,7 +346,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +507,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v2i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -619,7 +619,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s6, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s6, s10 @@ -809,7 +809,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v4i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_pow2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: udiv_i32_div_pow2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1148,7 +1148,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GFX1030-LABEL: udiv_i32_div_pow2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_even: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: udiv_i32_div_k_even: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GFX1030-LABEL: udiv_i32_div_k_even: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1277,7 +1277,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_odd: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: udiv_i32_div_k_odd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GFX1030-LABEL: udiv_i32_div_k_odd: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_udiv_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1429,7 +1429,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX1030-LABEL: v_udiv_i8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i16: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1651,7 +1651,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1725,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i23: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1885,7 +1885,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1922,7 +1922,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i24: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; SI-LABEL: scalarize_mulhu_4xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,7 +2076,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; VI-LABEL: scalarize_mulhu_4xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GFX1030-LABEL: scalarize_mulhu_4xi32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -2193,7 +2193,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read define amdgpu_kernel void @test_udiv2(i32 %p) { ; SI-LABEL: test_udiv2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; VI-LABEL: test_udiv2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GCN-LABEL: test_udiv2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GFX1030-LABEL: test_udiv2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; SI-LABEL: test_udiv_3_mulhu: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; VI-LABEL: test_udiv_3_mulhu: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -2279,7 +2279,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GCN-LABEL: test_udiv_3_mulhu: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2290,7 +2290,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GFX1030-LABEL: test_udiv_3_mulhu: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index a90454f50d198c..a77e3c226ad267 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -123,14 +123,14 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[2:3] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 @@ -154,37 +154,37 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 -; GCN-IR-NEXT: s_add_u32 s14, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s5, -1 +; GCN-IR-NEXT: s_add_u32 s14, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] ; GCN-IR-NEXT: s_add_u32 s2, s2, s16 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s14, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s15, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s6, 31 +; GCN-IR-NEXT: s_sub_u32 s4, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s6, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[4:5] +; GCN-IR-NEXT: s_and_b32 s4, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[2:3], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 @@ -398,17 +398,17 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s4, 8 +; GCN-NEXT: s_lshr_b32 s2, s6, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 @@ -423,17 +423,17 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s4, 8 +; GCN-IR-NEXT: s_lshr_b32 s2, s6, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 @@ -497,7 +497,7 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-NEXT: s_load_dword s8, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -507,7 +507,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s8, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -543,7 +543,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 @@ -576,7 +576,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -587,7 +587,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -614,7 +614,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -625,7 +625,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -659,17 +659,17 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s4, 9 +; GCN-NEXT: s_lshr_b32 s2, s6, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 @@ -684,17 +684,17 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s4, 9 +; GCN-IR-NEXT: s_lshr_b32 s2, s6, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 @@ -716,8 +716,8 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -831,21 +831,21 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s5, 0xffff -; GCN-IR-NEXT: s_and_b32 s0, s4, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s5, s7, 0xffff -; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-IR-NEXT: s_and_b32 s0, s0, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 ; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 -; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[4:5], 24 +; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 ; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[0:1] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[2:3], s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 @@ -856,7 +856,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9 ; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 +; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 @@ -871,37 +871,37 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s14 ; GCN-IR-NEXT: s_add_u32 s14, s0, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s4, s16 -; GCN-IR-NEXT: s_addc_u32 s9, s5, 0 +; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] +; GCN-IR-NEXT: s_add_u32 s8, s2, s16 +; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 +; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, s14, s12 -; GCN-IR-NEXT: s_subb_u32 s4, s15, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 +; GCN-IR-NEXT: s_sub_u32 s2, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s2, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s4, s10, 1 +; GCN-IR-NEXT: s_and_b32 s2, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow4 ; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[0:1] +; GCN-IR-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 @@ -920,7 +920,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] @@ -1364,7 +1364,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0xaaaaaaab ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaaa ; GCN-NEXT: s_mov_b32 s7, 0xf000 @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1563,7 +1563,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 8 @@ -1584,7 +1584,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 1468c7b99b5c25..a56346f3bb45bc 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -36,22 +36,22 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x26 -; GFX6-NEXT: s_load_dword s9, s[2:3], 0x1d +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x26 +; GFX6-NEXT: s_load_dword s9, s[4:5], 0x1d +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s6, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_mov_b32 s7, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 @@ -78,37 +78,37 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x98 -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x98 +; GFX8-NEXT: s_load_dword s7, s[4:5], 0x74 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX8-NEXT: s_sub_i32 s0, 0, s4 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0 +; GFX8-NEXT: v_mul_hi_u32 v4, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 -; GFX8-NEXT: s_mul_i32 s0, s0, s4 -; GFX8-NEXT: s_sub_i32 s0, s5, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s4 +; GFX8-NEXT: s_mul_i32 s0, s0, s6 +; GFX8-NEXT: s_sub_i32 s0, s7, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s6 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; GFX8-NEXT: s_cmp_ge_u32 s0, s4 +; GFX8-NEXT: s_cmp_ge_u32 s0, s6 ; GFX8-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX8-NEXT: s_sub_i32 s1, s0, s4 +; GFX8-NEXT: s_sub_i32 s1, s0, s6 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; GFX8-NEXT: s_cmp_ge_u32 s0, s4 +; GFX8-NEXT: s_cmp_ge_u32 s0, s6 ; GFX8-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 @@ -164,99 +164,97 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s6 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s6 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 -; GFX6-NEXT: s_cselect_b32 s4, s1, s0 -; GFX6-NEXT: s_sub_i32 s0, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s2 +; GFX6-NEXT: s_sub_i32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s6, s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_sub_i32 s2, 0, s3 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_sub_i32 s6, s5, s7 -; GFX6-NEXT: s_cmp_ge_u32 s5, s7 -; GFX6-NEXT: s_cselect_b32 s5, s6, s5 -; GFX6-NEXT: s_sub_i32 s6, s5, s7 -; GFX6-NEXT: s_cmp_ge_u32 s5, s7 -; GFX6-NEXT: s_cselect_b32 s5, s6, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: s_sub_i32 s0, 0, s6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: s_sub_i32 s6, 0, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s6 -; GFX8-NEXT: s_sub_i32 s0, s4, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s6 -; GFX8-NEXT: s_cmp_ge_u32 s0, s6 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s6 -; GFX8-NEXT: s_cmp_ge_u32 s0, s6 -; GFX8-NEXT: s_cselect_b32 s4, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8-NEXT: s_mul_i32 s6, s6, s2 +; GFX8-NEXT: s_sub_i32 s0, s0, s6 +; GFX8-NEXT: s_sub_i32 s6, s0, s2 +; GFX8-NEXT: s_cmp_ge_u32 s0, s2 +; GFX8-NEXT: s_cselect_b32 s0, s6, s0 +; GFX8-NEXT: s_sub_i32 s6, s0, s2 +; GFX8-NEXT: s_cmp_ge_u32 s0, s2 +; GFX8-NEXT: s_cselect_b32 s0, s6, s0 +; GFX8-NEXT: s_sub_i32 s2, 0, s3 +; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: s_mul_i32 s2, s2, s7 -; GFX8-NEXT: s_sub_i32 s2, s5, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s7 -; GFX8-NEXT: s_cmp_ge_u32 s2, s7 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s7 -; GFX8-NEXT: s_cmp_ge_u32 s2, s7 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: s_mul_i32 s0, s0, s3 +; GFX8-NEXT: s_sub_i32 s0, s1, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s3 +; GFX8-NEXT: s_cmp_ge_u32 s0, s3 +; GFX8-NEXT: s_cselect_b32 s0, s1, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s3 +; GFX8-NEXT: s_cmp_ge_u32 s0, s3 +; GFX8-NEXT: s_cselect_b32 s0, s1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm %result0 = udiv <2 x i32> %x, %y @@ -335,14 +333,13 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX6-NEXT: s_sub_i32 s0, 0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -350,82 +347,84 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s14 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_mul_i32 s0, s0, s12 +; GFX6-NEXT: s_sub_i32 s0, s8, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s12 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 +; GFX6-NEXT: s_sub_i32 s1, s0, s12 +; GFX6-NEXT: s_cmp_ge_u32 s0, s12 +; GFX6-NEXT: s_cselect_b32 s6, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s9 -; GFX6-NEXT: s_sub_i32 s1, s5, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s13 +; GFX6-NEXT: s_sub_i32 s0, s9, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s13 +; GFX6-NEXT: s_cmp_ge_u32 s0, s13 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s13 +; GFX6-NEXT: s_cmp_ge_u32 s0, s13 +; GFX6-NEXT: s_cselect_b32 s7, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s14 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s10 -; GFX6-NEXT: s_sub_i32 s4, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s14 +; GFX6-NEXT: s_sub_i32 s0, s10, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s14 +; GFX6-NEXT: s_cmp_ge_u32 s0, s14 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s14 +; GFX6-NEXT: s_cmp_ge_u32 s0, s14 +; GFX6-NEXT: s_cselect_b32 s8, s1, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s15 +; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: s_mul_i32 s0, s0, s11 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s15 +; GFX6-NEXT: s_sub_i32 s4, s11, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s15 +; GFX6-NEXT: s_cmp_ge_u32 s4, s15 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s15 +; GFX6-NEXT: s_cmp_ge_u32 s4, s15 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX8-NEXT: s_sub_i32 s0, 0, s12 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -433,74 +432,74 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s14 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s8 -; GFX8-NEXT: s_sub_i32 s0, s4, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s8 -; GFX8-NEXT: s_cmp_ge_u32 s0, s8 +; GFX8-NEXT: s_mul_i32 s0, s0, s12 +; GFX8-NEXT: s_sub_i32 s0, s8, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s12 +; GFX8-NEXT: s_cmp_ge_u32 s0, s12 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s8 -; GFX8-NEXT: s_cmp_ge_u32 s0, s8 -; GFX8-NEXT: s_cselect_b32 s4, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s9 +; GFX8-NEXT: s_sub_i32 s1, s0, s12 +; GFX8-NEXT: s_cmp_ge_u32 s0, s12 +; GFX8-NEXT: s_cselect_b32 s2, s1, s0 +; GFX8-NEXT: s_sub_i32 s0, 0, s13 ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s9 -; GFX8-NEXT: s_sub_i32 s0, s5, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s9 -; GFX8-NEXT: s_cmp_ge_u32 s0, s9 +; GFX8-NEXT: s_mul_i32 s0, s0, s13 +; GFX8-NEXT: s_sub_i32 s0, s9, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s13 +; GFX8-NEXT: s_cmp_ge_u32 s0, s13 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s9 -; GFX8-NEXT: s_cmp_ge_u32 s0, s9 -; GFX8-NEXT: s_cselect_b32 s5, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: s_sub_i32 s1, s0, s13 +; GFX8-NEXT: s_cmp_ge_u32 s0, s13 +; GFX8-NEXT: s_cselect_b32 s3, s1, s0 +; GFX8-NEXT: s_sub_i32 s0, 0, s14 ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s10 -; GFX8-NEXT: s_sub_i32 s0, s6, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s10 -; GFX8-NEXT: s_cmp_ge_u32 s0, s10 +; GFX8-NEXT: s_mul_i32 s0, s0, s14 +; GFX8-NEXT: s_sub_i32 s0, s10, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s14 +; GFX8-NEXT: s_cmp_ge_u32 s0, s14 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s10 -; GFX8-NEXT: s_cmp_ge_u32 s0, s10 +; GFX8-NEXT: s_sub_i32 s1, s0, s14 +; GFX8-NEXT: s_cmp_ge_u32 s0, s14 ; GFX8-NEXT: s_cselect_b32 s6, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s11 +; GFX8-NEXT: s_sub_i32 s0, 0, s15 ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s2, v3 -; GFX8-NEXT: s_mul_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s7, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s11 -; GFX8-NEXT: s_cmp_ge_u32 s2, s11 +; GFX8-NEXT: s_mul_i32 s2, s2, s15 +; GFX8-NEXT: s_sub_i32 s2, s11, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s15 +; GFX8-NEXT: s_cmp_ge_u32 s2, s15 ; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s3, s2, s11 -; GFX8-NEXT: s_cmp_ge_u32 s2, s11 +; GFX8-NEXT: s_sub_i32 s3, s2, s15 +; GFX8-NEXT: s_cmp_ge_u32 s2, s15 ; GFX8-NEXT: s_cselect_b32 s2, s3, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index d00ea6dff24474..55cbc14a467068 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -26,7 +26,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -66,7 +66,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -84,8 +84,8 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) { ; SI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -102,12 +102,12 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; VI-NEXT: v_ldexp_f64 v[4:5], v[2:3], 32 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -126,63 +126,63 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) { ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8 +; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s11 -; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s9 -; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s10 -; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s8 -; SI-NEXT: v_cvt_f64_u32_e32 v[8:9], s15 -; SI-NEXT: v_cvt_f64_u32_e32 v[10:11], s13 +; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s1 +; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 +; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s0 +; SI-NEXT: v_cvt_f64_u32_e32 v[8:9], s7 +; SI-NEXT: v_cvt_f64_u32_e32 v[10:11], s5 ; SI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; SI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 32 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; SI-NEXT: v_add_f64 v[0:1], v[4:5], v[6:7] -; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s14 -; SI-NEXT: v_cvt_f64_u32_e32 v[12:13], s12 +; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s6 +; SI-NEXT: v_cvt_f64_u32_e32 v[12:13], s4 ; SI-NEXT: v_ldexp_f64 v[6:7], v[8:9], 32 ; SI-NEXT: v_ldexp_f64 v[8:9], v[10:11], 32 -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: s_add_u32 s0, s8, 16 +; SI-NEXT: s_addc_u32 s1, s9, 0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] ; SI-NEXT: v_add_f64 v[4:5], v[8:9], v[12:13] -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: v_mov_b32_e32 v8, s2 +; SI-NEXT: v_mov_b32_e32 v9, s1 +; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s15 -; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s13 -; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s11 -; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s9 +; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s7 +; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s5 +; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s1 ; VI-NEXT: v_ldexp_f64 v[8:9], v[2:3], 32 ; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 32 ; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; VI-NEXT: v_ldexp_f64 v[10:11], v[6:7], 32 -; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s14 -; VI-NEXT: v_cvt_f64_u32_e32 v[12:13], s12 -; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s10 -; VI-NEXT: v_cvt_f64_u32_e32 v[14:15], s8 +; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s6 +; VI-NEXT: v_cvt_f64_u32_e32 v[12:13], s4 +; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 +; VI-NEXT: v_cvt_f64_u32_e32 v[14:15], s0 ; VI-NEXT: v_add_f64 v[6:7], v[8:9], v[6:7] ; VI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13] ; VI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; VI-NEXT: v_add_f64 v[0:1], v[10:11], v[14:15] -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v11, s3 -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: v_mov_b32_e32 v10, s2 -; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: s_add_u32 s0, s8, 16 +; VI-NEXT: s_addc_u32 s1, s9, 0 +; VI-NEXT: v_mov_b32_e32 v11, s1 +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v10, s0 +; VI-NEXT: v_mov_b32_e32 v9, s9 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm @@ -194,8 +194,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_uint_to_fp_i32_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -205,8 +205,8 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; ; VI-LABEL: s_uint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -221,7 +221,7 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -237,8 +237,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) { ; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -257,8 +257,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -284,8 +284,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -298,8 +298,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: uint_to_fp_i1_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -318,8 +318,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -332,8 +332,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; ; VI-LABEL: uint_to_fp_i1_to_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -351,8 +351,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; SI-LABEL: s_uint_to_fp_i8_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -363,8 +363,8 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_uint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -400,8 +400,8 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -414,8 +414,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -451,8 +451,8 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -465,8 +465,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -503,8 +503,8 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[8:9], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -517,8 +517,8 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 764f99fb6833fe..8e210b88969548 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -73,7 +73,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +124,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 @@ -158,7 +158,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -194,7 +194,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -219,7 +219,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 @@ -301,81 +301,81 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_flbit_i32_b32 s8, s7 -; GFX6-NEXT: s_flbit_i32_b32 s9, s5 +; GFX6-NEXT: s_flbit_i32_b32 s8, s3 +; GFX6-NEXT: s_flbit_i32_b32 s9, s1 ; GFX6-NEXT: s_min_u32 s8, s8, 32 ; GFX6-NEXT: s_min_u32 s9, s9, 32 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 ; GFX6-NEXT: s_sub_i32 s8, 32, s8 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 ; GFX6-NEXT: s_sub_i32 s9, 32, s9 -; GFX6-NEXT: s_min_u32 s6, s6, 1 -; GFX6-NEXT: s_min_u32 s4, s4, 1 -; GFX6-NEXT: s_or_b32 s6, s7, s6 -; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX6-NEXT: s_min_u32 s2, s2, 1 +; GFX6-NEXT: s_min_u32 s0, s0, 1 +; GFX6-NEXT: s_or_b32 s2, s3, s2 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s8 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s9 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_flbit_i32_b32 s2, s7 -; GFX8-NEXT: s_flbit_i32_b32 s3, s5 -; GFX8-NEXT: s_min_u32 s8, s2, 32 -; GFX8-NEXT: s_min_u32 s9, s3, 32 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 +; GFX8-NEXT: s_flbit_i32_b32 s6, s3 +; GFX8-NEXT: s_flbit_i32_b32 s7, s1 +; GFX8-NEXT: s_min_u32 s6, s6, 32 +; GFX8-NEXT: s_min_u32 s7, s7, 32 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX8-NEXT: s_min_u32 s2, s2, 1 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_min_u32 s0, s0, 1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX8-NEXT: s_min_u32 s2, s4, 1 -; GFX8-NEXT: s_or_b32 s2, s5, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX8-NEXT: s_sub_i32 s2, 32, s8 -; GFX8-NEXT: v_ldexp_f32 v1, v0, s2 -; GFX8-NEXT: s_sub_i32 s2, 32, s9 -; GFX8-NEXT: v_ldexp_f32 v0, v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s6 +; GFX8-NEXT: v_ldexp_f32 v1, v0, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s7 +; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s7 -; GFX11-NEXT: s_clz_i32_u32 s3, s5 -; GFX11-NEXT: s_min_u32 s8, s2, 32 -; GFX11-NEXT: s_min_u32 s9, s3, 32 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_clz_i32_u32 s6, s3 +; GFX11-NEXT: s_clz_i32_u32 s7, s1 +; GFX11-NEXT: s_min_u32 s6, s6, 32 +; GFX11-NEXT: s_min_u32 s7, s7, 32 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX11-NEXT: s_min_u32 s2, s2, 1 -; GFX11-NEXT: s_min_u32 s4, s4, 1 +; GFX11-NEXT: s_min_u32 s0, s0, 1 ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX11-NEXT: s_sub_i32 s2, 32, s8 -; GFX11-NEXT: s_sub_i32 s3, 32, s9 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX11-NEXT: s_sub_i32 s0, 32, s6 +; GFX11-NEXT: s_sub_i32 s1, 32, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v1, v0, s2 -; GFX11-NEXT: v_ldexp_f32 v0, v2, s3 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: v_ldexp_f32 v1, v0, s0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, s1 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x float> store <2 x float> %result, ptr addrspace(1) %out @@ -385,7 +385,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -436,7 +436,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 @@ -549,27 +549,27 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_flbit_i32_b32 s8, s7 -; GFX6-NEXT: s_flbit_i32_b32 s9, s5 -; GFX6-NEXT: s_min_u32 s8, s8, 32 -; GFX6-NEXT: s_min_u32 s9, s9, 32 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GFX6-NEXT: s_sub_i32 s8, 32, s8 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX6-NEXT: s_sub_i32 s9, 32, s9 -; GFX6-NEXT: s_min_u32 s6, s6, 1 +; GFX6-NEXT: s_flbit_i32_b32 s4, s11 +; GFX6-NEXT: s_flbit_i32_b32 s5, s9 +; GFX6-NEXT: s_min_u32 s6, s4, 32 +; GFX6-NEXT: s_min_u32 s12, s5, 32 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[10:11], s6 +; GFX6-NEXT: s_sub_i32 s10, 32, s6 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 +; GFX6-NEXT: s_sub_i32 s8, 32, s12 ; GFX6-NEXT: s_min_u32 s4, s4, 1 -; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: s_min_u32 s6, s6, 1 ; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8 -; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9 +; GFX6-NEXT: s_or_b32 s5, s7, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s10 +; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s8 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -579,63 +579,63 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_flbit_i32_b32 s2, s7 -; GFX8-NEXT: s_flbit_i32_b32 s3, s5 -; GFX8-NEXT: s_min_u32 s8, s2, 32 -; GFX8-NEXT: s_min_u32 s9, s3, 32 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 +; GFX8-NEXT: s_flbit_i32_b32 s6, s3 +; GFX8-NEXT: s_flbit_i32_b32 s7, s1 +; GFX8-NEXT: s_min_u32 s6, s6, 32 +; GFX8-NEXT: s_min_u32 s7, s7, 32 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_min_u32 s0, s0, 1 ; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9 -; GFX8-NEXT: s_min_u32 s2, s2, 1 -; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX8-NEXT: s_sub_i32 s6, 32, s8 -; GFX8-NEXT: s_sub_i32 s2, 32, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX8-NEXT: s_sub_i32 s6, 32, s6 +; GFX8-NEXT: s_sub_i32 s0, 32, s7 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s6 -; GFX8-NEXT: v_ldexp_f32 v1, v1, s2 +; GFX8-NEXT: v_ldexp_f32 v1, v1, s0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s7 -; GFX11-NEXT: s_clz_i32_u32 s3, s5 -; GFX11-NEXT: s_min_u32 s8, s2, 32 -; GFX11-NEXT: s_min_u32 s9, s3, 32 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_clz_i32_u32 s6, s3 +; GFX11-NEXT: s_clz_i32_u32 s7, s1 +; GFX11-NEXT: s_min_u32 s6, s6, 32 +; GFX11-NEXT: s_min_u32 s7, s7, 32 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX11-NEXT: s_min_u32 s2, s2, 1 -; GFX11-NEXT: s_min_u32 s4, s4, 1 +; GFX11-NEXT: s_min_u32 s0, s0, 1 ; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX11-NEXT: s_sub_i32 s2, 32, s8 -; GFX11-NEXT: s_sub_i32 s3, 32, s9 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX11-NEXT: s_sub_i32 s0, 32, s6 +; GFX11-NEXT: s_sub_i32 s1, 32, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 -; GFX11-NEXT: v_ldexp_f32 v1, v1, s3 +; GFX11-NEXT: v_ldexp_f32 v0, v0, s0 +; GFX11-NEXT: v_ldexp_f32 v1, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x half> store <2 x half> %result, ptr addrspace(1) %out @@ -645,7 +645,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -704,7 +704,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -763,7 +763,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 686dba7e53e62d..37d1116e9eccb6 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; VI-LABEL: uitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -44,7 +44,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; GFX11-TRUE16-LABEL: uitofp_i16_to_f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -62,7 +62,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; GFX11-FAKE16-LABEL: uitofp_i16_to_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -89,7 +89,7 @@ entry: define amdgpu_kernel void @uitofp_i32_to_f16( ; SI-LABEL: uitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -108,7 +108,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; VI-LABEL: uitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -127,7 +127,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; GFX11-TRUE16-LABEL: uitofp_i32_to_f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -147,7 +147,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; GFX11-FAKE16-LABEL: uitofp_i32_to_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -178,7 +178,7 @@ entry: define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; SI-LABEL: uitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -203,7 +203,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; VI-LABEL: uitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -223,7 +223,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; GFX11-TRUE16-LABEL: uitofp_v2i16_to_v2f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -279,7 +279,7 @@ entry: define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; SI-LABEL: uitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -302,7 +302,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; VI-LABEL: uitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -324,7 +324,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; GFX11-TRUE16-LABEL: uitofp_v2i32_to_v2f16: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 @@ -351,7 +351,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -384,21 +384,19 @@ entry: define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_uint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -406,26 +404,26 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s12, s10 +; VI-NEXT: s_mov_b32 s13, s11 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -433,27 +431,29 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s8 +; VI-NEXT: s_mov_b32 s1, s9 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 -; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 -; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -463,27 +463,27 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 -; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4 -; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -493,7 +493,7 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index a3fc6ded0a0047..729dbab1906f45 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s0, 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -13,7 +13,7 @@ define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB0_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { ; ; VI-LABEL: uniform_if_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s0, 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB0_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -57,16 +57,16 @@ done: define amdgpu_kernel void @uniform_if_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 -; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: v_cmp_eq_f32_e64 s[2:3], s1, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB1_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -76,16 +76,16 @@ define amdgpu_kernel void @uniform_if_vcc(float %cond, ptr addrspace(1) %out) { ; ; VI-LABEL: uniform_if_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s1, s[2:3], 0x24 +; VI-NEXT: s_load_dword s1, s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[4:5] +; VI-NEXT: v_cmp_eq_f32_e64 s[2:3], s1, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccnz .LBB1_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB1_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -111,7 +111,7 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -119,7 +119,7 @@ define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspa ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB2_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -129,7 +129,7 @@ define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspa ; ; VI-LABEL: uniform_if_swap_br_targets_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -137,7 +137,7 @@ define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspa ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB2_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -163,16 +163,16 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 -; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], s1, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccnz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB3_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -182,16 +182,16 @@ define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, ptr addrs ; ; VI-LABEL: uniform_if_swap_br_targets_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s1, s[2:3], 0x24 +; VI-NEXT: s_load_dword s1, s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[4:5] +; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], s1, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccnz .LBB3_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB3_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -219,14 +219,14 @@ done: define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, s0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -237,14 +237,14 @@ define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) ; ; VI-LABEL: uniform_if_move_valu: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, s0, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; VI-NEXT: s_cbranch_vccnz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -271,14 +271,14 @@ endif: define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu_commute: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, s0, v0 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; SI-NEXT: s_cbranch_vccnz .LBB5_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -289,14 +289,14 @@ define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, f ; ; VI-LABEL: uniform_if_move_valu_commute: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, s0, v0 ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; VI-NEXT: s_cbranch_vccnz .LBB5_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -322,36 +322,38 @@ endif: define amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out, i32 %a) { ; SI-LABEL: uniform_if_else_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB6_2 ; SI-NEXT: ; %bb.1: ; %if.else +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB6_2: ; %if.then +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_else_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB6_2 ; VI-NEXT: ; %bb.1: ; %if.else +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB6_2: ; %if.then +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -374,19 +376,20 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr addrspace(1) nocapture %out1, i32 %a) { ; SI-LABEL: uniform_if_else: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_2 ; SI-NEXT: ; %bb.1: ; %if.else +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, 2 ; SI-NEXT: s_branch .LBB7_3 ; SI-NEXT: .LBB7_2: ; %if.then +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -401,19 +404,20 @@ define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr ; ; VI-LABEL: uniform_if_else: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_2 ; VI-NEXT: ; %bb.1: ; %if.else +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, 2 ; VI-NEXT: s_branch .LBB7_3 ; VI-NEXT: .LBB7_2: ; %if.then +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -444,14 +448,14 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: icmp_2_users: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_gt_i32 s4, 0 +; SI-NEXT: s_cmp_gt_i32 s2, 0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s4, 1 +; SI-NEXT: s_cmp_lt_i32 s2, 1 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %IF -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] @@ -462,14 +466,14 @@ define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; ; VI-LABEL: icmp_2_users: ; VI: ; %bb.0: ; %main_body -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_gt_i32 s4, 0 +; VI-NEXT: s_cmp_gt_i32 s2, 0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lt_i32 s4, 1 +; VI-NEXT: s_cmp_lt_i32 s2, 1 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %IF -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] @@ -493,7 +497,7 @@ ENDIF: ; preds = %IF, %main_body define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, ptr addrspace(1) %out) { ; SI-LABEL: icmp_users_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lt_i32 s0, 1 ; SI-NEXT: s_cbranch_scc1 .LBB9_2 @@ -506,7 +510,7 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB9_3: ; %bb7 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -516,7 +520,7 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p ; ; VI-LABEL: icmp_users_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lt_i32 s0, 1 ; VI-NEXT: s_cbranch_scc1 .LBB9_2 @@ -529,7 +533,7 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB9_3: ; %bb7 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -558,7 +562,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: uniform_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: .LBB10_1: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -570,7 +574,7 @@ define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: uniform_loop: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-NEXT: .LBB10_1: ; %loop ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -601,13 +605,13 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; SI-NEXT: s_cbranch_execz .LBB11_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_cbranch_scc0 .LBB11_3 ; SI-NEXT: .LBB11_2: ; %endif @@ -624,13 +628,13 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc ; VI-NEXT: s_cbranch_execz .LBB11_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_cbranch_scc0 .LBB11_3 ; VI-NEXT: .LBB11_2: ; %endif @@ -660,14 +664,14 @@ endif: define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_inside_uniform: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB12_2 ; SI-NEXT: .LBB12_1: ; %endif ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB12_2: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -683,14 +687,14 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; ; VI-LABEL: divergent_inside_uniform: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB12_2 ; VI-NEXT: .LBB12_1: ; %endif ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB12_2: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -724,57 +728,57 @@ endif: define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_if_uniform_if: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SI-NEXT: s_cbranch_execz .LBB13_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: .LBB13_2: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.3: ; %exit ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB13_4: ; %if_uniform -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: divergent_if_uniform_if: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; VI-NEXT: s_cbranch_execz .LBB13_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: .LBB13_2: ; %endif -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.3: ; %exit ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB13_4: ; %if_uniform -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -805,12 +809,12 @@ exit: define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: cse_uniform_condition_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lt_i32 s0, 1 ; SI-NEXT: s_cbranch_scc1 .LBB14_2 ; SI-NEXT: ; %bb.1: ; %bb2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -825,12 +829,12 @@ define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr ; ; VI-LABEL: cse_uniform_condition_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lt_i32 s0, 1 ; VI-NEXT: s_cbranch_scc1 .LBB14_2 ; VI-NEXT: ; %bb.1: ; %bb2 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -865,7 +869,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -884,7 +888,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -919,7 +923,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_ne: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -938,7 +942,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_ne: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -973,7 +977,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_sgt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -992,7 +996,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %o ; ; VI-LABEL: uniform_if_scc_i64_sgt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -1036,7 +1040,7 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB18_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -1055,7 +1059,7 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB18_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1090,7 +1094,7 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s0, 1 ; SI-NEXT: .LBB19_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -1109,7 +1113,7 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s0, 1 ; VI-NEXT: .LBB19_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index 18b2397bbd5a7e..67f760320e78ea 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-LABEL: test_insert_extract: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX90A-NEXT: s_mov_b32 s2, 0 ; GFX90A-NEXT: s_and_b64 vcc, exec, -1 ; GFX90A-NEXT: s_mov_b32 s3, 0 @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX940-LABEL: test_insert_extract: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX940-NEXT: s_mov_b32 s2, 0 ; GFX940-NEXT: s_and_b64 vcc, exec, -1 ; GFX940-NEXT: s_mov_b32 s3, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1030-LABEL: test_insert_extract: ; GFX1030: ; %bb.0: ; %entry -; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX1030-NEXT: s_mov_b32 s2, 0 ; GFX1030-NEXT: s_mov_b32 s3, 0 ; GFX1030-NEXT: s_mov_b32 s4, 0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1100-LABEL: test_insert_extract: ; GFX1100: ; %bb.0: ; %entry -; GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1100-NEXT: s_mov_b32 s2, 0 ; GFX1100-NEXT: s_mov_b32 s3, 0 ; GFX1100-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index a5e1506114f2d0..b678e3e87202a6 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -66,9 +66,9 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s14, s44 ; GCN-NEXT: s_mov_b32 s15, s45 ; GCN-NEXT: s_mov_b64 s[4:5], s[34:35] +; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] ; GCN-NEXT: s_mov_b64 s[8:9], s[38:39] ; GCN-NEXT: s_mov_b64 s[10:11], s[40:41] -; GCN-NEXT: s_mov_b64 s[6:7], s[36:37] ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[16:17], 0 ; GCN-NEXT: s_andn2_b64 s[18:19], s[46:47], exec @@ -292,14 +292,14 @@ define hidden void @blam() { ; GCN-NEXT: v_writelane_b32 v45, s55, 23 ; GCN-NEXT: v_writelane_b32 v45, s56, 24 ; GCN-NEXT: v_writelane_b32 v45, s57, 25 -; GCN-NEXT: s_mov_b64 s[34:35], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_mov_b32 s46, s15 ; GCN-NEXT: s_mov_b32 s47, s14 ; GCN-NEXT: s_mov_b32 s48, s13 ; GCN-NEXT: s_mov_b32 s49, s12 -; GCN-NEXT: s_mov_b64 s[36:37], s[10:11] -; GCN-NEXT: s_mov_b64 s[38:39], s[8:9] +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -342,9 +342,9 @@ define hidden void @blam() { ; GCN-NEXT: s_add_u32 s16, s16, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, spam@rel32@hi+12 ; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] -; GCN-NEXT: s_mov_b64 s[6:7], s[34:35] -; GCN-NEXT: s_mov_b64 s[8:9], s[38:39] -; GCN-NEXT: s_mov_b64 s[10:11], s[36:37] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] ; GCN-NEXT: s_mov_b32 s12, s49 ; GCN-NEXT: s_mov_b32 s13, s48 ; GCN-NEXT: s_mov_b32 s14, s47 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 3f346db3f3e665..b4f977db804392 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,14 +122,14 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 @@ -153,47 +153,47 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 -; GCN-IR-NEXT: s_add_u32 s16, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s6, s18 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 +; GCN-IR-NEXT: s_add_u32 s16, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s7, -1 +; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] +; GCN-IR-NEXT: s_add_u32 s10, s4, s18 +; GCN-IR-NEXT: s_addc_u32 s11, s5, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 +; GCN-IR-NEXT: s_sub_u32 s4, s16, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s17, s13 +; GCN-IR-NEXT: s_ashr_i32 s14, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] +; GCN-IR-NEXT: s_and_b32 s4, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 ; GCN-IR-NEXT: s_mov_b32 s12, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s4, s9 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s5, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s7, s8 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s4, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 @@ -413,7 +413,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -424,7 +424,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +459,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -490,112 +490,112 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-NEXT: s_lshr_b32 s0, s13, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s8, s7, 1 +; GCN-NEXT: s_lshr_b32 s6, s15, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_lshr_b32 s7, s11, 1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: s_lshr_b32 s1, s9, 1 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s5, s5, s0 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-NEXT: s_sub_i32 s4, 0, s1 -; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s2, s2, s0 +; GCN-NEXT: s_sub_i32 s1, s1, s2 +; GCN-NEXT: s_sub_i32 s2, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s8, s2, s1 +; GCN-NEXT: s_sub_i32 s0, 0, s6 +; GCN-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_mul_i32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: s_mul_i32 s4, s4, s6 +; GCN-NEXT: s_sub_i32 s4, s7, s4 +; GCN-NEXT: s_sub_i32 s5, s4, s6 +; GCN-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-NEXT: s_sub_i32 s5, s4, s6 +; GCN-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem31_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-IR-NEXT: s_lshr_b32 s0, s13, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s15, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_lshr_b32 s7, s11, 1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-IR-NEXT: s_mul_i32 s2, s2, s0 +; GCN-IR-NEXT: s_sub_i32 s1, s1, s2 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-IR-NEXT: s_cselect_b32 s8, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s0, 0, s6 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-IR-NEXT: s_mul_i32 s4, s4, s6 +; GCN-IR-NEXT: s_sub_i32 s4, s7, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -607,11 +607,11 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem24_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_lshr_b32 s4, s4, 8 +; GCN-NEXT: s_lshr_b32 s4, s6, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 @@ -632,11 +632,11 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem24_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s6, s[4:5], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: s_lshr_b32 s4, s4, 8 +; GCN-IR-NEXT: s_lshr_b32 s4, s6, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 @@ -664,112 +664,112 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-NEXT: s_lshr_b32 s0, s13, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s8, s7, 9 +; GCN-NEXT: s_lshr_b32 s6, s15, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: s_lshr_b32 s1, s9, 1 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s5, s5, s0 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-NEXT: s_sub_i32 s4, 0, s1 -; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: s_mul_i32 s2, s2, s0 +; GCN-NEXT: s_sub_i32 s1, s1, s2 +; GCN-NEXT: s_sub_i32 s2, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-NEXT: s_sub_i32 s2, s1, s0 +; GCN-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-NEXT: s_cselect_b32 s8, s2, s1 +; GCN-NEXT: s_sub_i32 s0, 0, s6 +; GCN-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v2, s7, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_mul_i32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: s_mul_i32 s4, s4, s6 +; GCN-NEXT: s_sub_i32 s4, s7, s4 +; GCN-NEXT: s_sub_i32 s5, s4, s6 +; GCN-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-NEXT: s_sub_i32 s5, s4, s6 +; GCN-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 +; GCN-IR-NEXT: s_lshr_b32 s0, s13, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s6, s15, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-IR-NEXT: s_mul_i32 s2, s2, s0 +; GCN-IR-NEXT: s_sub_i32 s1, s1, s2 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 +; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 +; GCN-IR-NEXT: s_cselect_b32 s8, s2, s1 +; GCN-IR-NEXT: s_sub_i32 s0, 0, s6 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 +; GCN-IR-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-IR-NEXT: s_mul_i32 s4, s4, s6 +; GCN-IR-NEXT: s_sub_i32 s4, s7, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 +; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 +; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -781,7 +781,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -885,7 +885,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] @@ -965,7 +965,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0xaaaaaaab ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaaa ; GCN-NEXT: s_mov_b32 s7, 0xf000 @@ -1002,7 +1002,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1353,7 +1353,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 @@ -1376,7 +1376,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1430,7 +1430,7 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index cf13bb2efcae7c..2f4f08175be0ed 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -7,60 +7,60 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_sub_u32 s4, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_subb_u32 s5, s7, s9 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_sub_u32 s0, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_subb_u32 s1, s3, s9 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_sub_u32 s0, s6, s0 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: s_subb_u32 s1, s7, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_subb_u32 s1, s3, s5 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_usubo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_subb_u32 s1, s7, s1 -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_sub_u32 s4, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_subb_u32 s5, s3, s7 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 %val = extractvalue { i64, i1 } %usub, 0 @@ -75,35 +75,35 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: flat_store_byte v[2:3], v5 @@ -111,15 +111,15 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,16 +182,16 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,19 +268,19 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] -; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: global_store_byte v0, v2, s[6:7] +; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -345,19 +345,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s8, s10 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_subb_u32 s1, s9, s11 +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] -; GFX9-NEXT: global_store_byte v4, v0, s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,18 +424,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v4, v0, s[6:7] +; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -510,17 +510,17 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] -; GFX9-NEXT: global_load_ushort v2, v0, s[10:11] +; GFX9-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX9-NEXT: global_load_ushort v2, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v2, v1, v2 ; GFX9-NEXT: v_cmp_gt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: global_store_short v0, v2, s[4:5] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v2, s[8:9] +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -539,7 +539,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_usubo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -570,7 +570,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_usubo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -593,18 +593,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_usubo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 @@ -620,7 +620,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -630,7 +630,7 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] @@ -647,7 +647,7 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: s_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: s_cmp_eq_u32 s0, s1 @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -670,7 +670,7 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_cmp_eq_u32 s0, s1 @@ -680,12 +680,12 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] -; GFX9-NEXT: global_store_byte v1, v2, s[6:7] +; GFX9-NEXT: global_store_dword v1, v0, s[8:9] +; GFX9-NEXT: global_store_byte v1, v2, s[10:11] ; GFX9-NEXT: s_endpgm entry: %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) @@ -709,7 +709,7 @@ exit: define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -743,7 +743,7 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -770,12 +770,12 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-NEXT: global_load_dword v3, v0, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX9-NEXT: v_sub_co_u32_e64 v1, s[0:1], v2, v3 @@ -783,9 +783,9 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: ; %bb.1: ; %if ; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 ; GFX9-NEXT: .LBB9_2: ; %exit -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll index 1bb76cf547e25b..651650fcb7a54f 100644 --- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll @@ -25,16 +25,16 @@ bb: define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_add_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v2, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5] +; GFX9-NEXT: global_load_dword v4, v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll index 39c7538738eb15..5b40d53e0a81c4 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll @@ -23,7 +23,7 @@ entry: define amdgpu_kernel void @fcmp_test(half %x, half %y) { ; CHECK-LABEL: fcmp_test: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 @@ -46,7 +46,7 @@ entry: define amdgpu_kernel void @ballot_test(half %x, half %y) { ; CHECK-LABEL: ballot_test: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index e40f1e89afd28c..f20c1ccb2d63eb 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -13,9 +13,9 @@ declare double @llvm.fabs.f64(double) define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cnd_nan_nosgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 @@ -33,17 +33,17 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; VI-LABEL: v_cnd_nan_nosgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, 0 +; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc @@ -54,17 +54,17 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; GFX10-LABEL: v_cnd_nan_nosgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -73,7 +73,7 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; GFX11-LABEL: v_cnd_nan_nosgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -81,10 +81,10 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -107,7 +107,7 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { ; SI-LABEL: v_cnd_nan: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +122,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; VI-LABEL: v_cnd_nan: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -135,18 +135,18 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s2, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 @@ -167,8 +167,8 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -182,8 +182,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -198,20 +198,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -232,8 +232,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -247,8 +247,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -263,26 +263,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -297,8 +297,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -312,8 +312,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -328,20 +328,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -362,8 +362,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -377,8 +377,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -393,26 +393,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -427,33 +427,33 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -466,16 +466,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -483,17 +483,17 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -512,33 +512,33 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -551,16 +551,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -568,17 +568,17 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -597,37 +597,37 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc @@ -636,32 +636,30 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -677,43 +675,43 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc @@ -723,35 +721,35 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -769,43 +767,43 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5 ; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc @@ -815,35 +813,35 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -861,44 +859,44 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc @@ -909,37 +907,37 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc -; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -957,49 +955,49 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_mov_b32_e32 v5, v2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc ; VI-NEXT: flat_load_dword v6, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1012,43 +1010,43 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1066,49 +1064,49 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_mov_b32_e32 v5, v2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc ; VI-NEXT: flat_load_dword v6, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1121,43 +1119,43 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1177,49 +1175,49 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_mov_b32_e32 v5, v2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc ; VI-NEXT: flat_load_dword v6, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1232,43 +1230,43 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1286,47 +1284,47 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[2:3], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[4:5], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: v_and_b32_e32 v3, 1, v3 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_byte v2, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: flat_load_dword v2, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_ubyte v3, v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; VI-NEXT: v_and_b32_e32 v3, 1, v3 @@ -1339,11 +1337,11 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v2, v1, s[10:11] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1352,19 +1350,19 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: global_store_byte v0, v1, s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v1, s[10:11] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1375,7 +1373,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX11-NEXT: global_store_b8 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b8 v0, v1, s[8:9] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1394,48 +1392,48 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000 -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc ; VI-NEXT: flat_load_dword v6, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v5 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6 @@ -1447,39 +1445,39 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1498,47 +1496,47 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc ; VI-NEXT: flat_load_dword v6, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v5 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v6 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1549,39 +1547,39 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1600,43 +1598,43 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 4.0, v3, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v5 ; VI-NEXT: v_cndmask_b32_e32 v2, 4.0, v2, vcc @@ -1646,35 +1644,35 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1693,47 +1691,47 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v2 ; SI-NEXT: v_cndmask_b32_e64 v2, v3, -1.0, vcc ; SI-NEXT: v_cndmask_b32_e64 v3, v3, -2.0, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v5 ; VI-NEXT: v_cndmask_b32_e64 v3, v2, -1.0, vcc @@ -1747,42 +1745,42 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[4:5] +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc -; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1804,18 +1802,18 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 @@ -1823,22 +1821,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1851,17 +1849,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1872,7 +1870,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; ; GFX11-LABEL: v_cndmask_abs_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1880,10 +1878,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1906,9 +1904,9 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 @@ -1926,17 +1924,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; VI-LABEL: v_cndmask_abs_neg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] @@ -1947,17 +1945,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] @@ -1966,7 +1964,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; GFX11-LABEL: v_cndmask_abs_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1974,10 +1972,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] @@ -1997,18 +1995,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dword s8, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 @@ -2016,22 +2014,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 @@ -2045,17 +2043,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 @@ -2067,7 +2065,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; ; GFX11-LABEL: v_cndmask_abs_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -2075,10 +2073,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 691009e9c58d1b..ee16dad2d7d11c 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -6,70 +6,70 @@ define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s7 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -93,32 +93,32 @@ entry: define amdgpu_kernel void @madak_f16_use_2( ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s8 -; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s19, s15 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s16, s12 +; SI-NEXT: s_mov_b32 s17, s13 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -126,78 +126,78 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_mac_f32_e32 v3, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 -; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16_use_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s19, s15 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: s_mov_b32 s11, s15 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s16, s12 +; VI-NEXT: s_mov_b32 s17, s13 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s12, s14 +; VI-NEXT: s_mov_b32 s13, s15 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, 0x4900 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s0, s8 +; VI-NEXT: s_mov_b32 s1, s9 +; VI-NEXT: s_mov_b32 s4, s10 +; VI-NEXT: s_mov_b32 s5, s11 ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 ; VI-NEXT: v_mac_f16_e32 v3, v0, v2 -; VI-NEXT: buffer_store_short v1, off, s[12:15], 0 -; VI-NEXT: buffer_store_short v3, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v3, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16_use_2: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-NEXT: s_mov_b32 s14, -1 -; GFX11-NEXT: s_mov_b32 s15, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s14 -; GFX11-NEXT: s_mov_b32 s19, s15 -; GFX11-NEXT: s_mov_b32 s22, s14 -; GFX11-NEXT: s_mov_b32 s23, s15 -; GFX11-NEXT: s_mov_b32 s2, s14 -; GFX11-NEXT: s_mov_b32 s3, s15 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s6 +; GFX11-NEXT: s_mov_b32 s19, s7 +; GFX11-NEXT: s_mov_b32 s22, s6 +; GFX11-NEXT: s_mov_b32 s23, s7 +; GFX11-NEXT: s_mov_b32 s2, s6 +; GFX11-NEXT: s_mov_b32 s3, s7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s8 -; GFX11-NEXT: s_mov_b32 s17, s9 -; GFX11-NEXT: s_mov_b32 s20, s10 -; GFX11-NEXT: s_mov_b32 s21, s11 +; GFX11-NEXT: s_mov_b32 s16, s12 +; GFX11-NEXT: s_mov_b32 s17, s13 +; GFX11-NEXT: s_mov_b32 s20, s14 +; GFX11-NEXT: s_mov_b32 s21, s15 ; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s4 -; GFX11-NEXT: s_mov_b32 s13, s5 -; GFX11-NEXT: s_mov_b32 s0, s6 -; GFX11-NEXT: s_mov_b32 s1, s7 +; GFX11-NEXT: s_mov_b32 s4, s8 +; GFX11-NEXT: s_mov_b32 s5, s9 +; GFX11-NEXT: s_mov_b32 s0, s10 +; GFX11-NEXT: s_mov_b32 s1, s11 ; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1 ; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 -; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 5a7cce39f61032..2eba67b06bae1d 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -7,12 +7,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -24,12 +24,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; ; GISEL-LABEL: v_pack_b32_v2f16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -56,12 +56,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16_sub: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -73,12 +73,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; ; GISEL-LABEL: v_pack_b32_v2f16_sub: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -105,36 +105,36 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs define amdgpu_kernel void @fptrunc( ; GCN-LABEL: fptrunc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0x31016000 -; GCN-NEXT: s_mov_b32 s10, s2 -; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0x31016000 +; GCN-NEXT: s_mov_b32 s10, s6 +; GCN-NEXT: s_mov_b32 s11, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s6 -; GCN-NEXT: s_mov_b32 s9, s7 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s8, s2 +; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GISEL-LABEL: fptrunc: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GISEL-NEXT: s_mov_b32 s6, -1 -; GISEL-NEXT: s_mov_b32 s7, 0x31016000 +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0 -; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { @@ -147,12 +147,12 @@ define amdgpu_kernel void @fptrunc( define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fabs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -164,12 +164,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fabs: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -198,12 +198,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fneg: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2 @@ -215,12 +215,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fneg: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc +; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc +; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 3cfc3dcd0efdb6..b919bf0605a121 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -89,7 +89,7 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) { ; SDAG-VI-LABEL: basic_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 @@ -104,20 +104,20 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_med3_i16 v2, s6, 0, v1 -; SDAG-GFX9-NEXT: v_med3_i16 v1, s7, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v2, s2, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v1, s3, 0, v1 ; SDAG-GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SDAG-GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff @@ -130,7 +130,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -154,27 +154,27 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, 0xff +; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s0 -; GISEL-GFX9-NEXT: s_max_i32 s0, s3, s0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s1 -; GISEL-GFX9-NEXT: s_min_i32 s0, s0, s1 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s4 +; GISEL-GFX9-NEXT: s_max_i32 s3, s3, s4 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s5 +; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s5 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -409,13 +409,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) { ; SDAG-VI-LABEL: vec_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s2, 0 +; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 ; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 @@ -426,12 +426,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; SDAG-GFX9-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; SDAG-GFX9-NEXT: s_movk_i32 s2, 0xff +; SDAG-GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; SDAG-GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_pk_max_i16 v1, s4, 0 +; SDAG-GFX9-NEXT: v_pk_max_i16 v1, s2, 0 +; SDAG-GFX9-NEXT: s_movk_i32 s2, 0xff ; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s2 op_sel_hi:[1,0] ; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX9-NEXT: s_endpgm @@ -439,11 +439,11 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; SDAG-GFX11-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0 +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -451,24 +451,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: s_lshr_b32 s3, s4, 16 +; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16 +; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-VI-NEXT: s_max_i32 s4, s4, s2 -; GISEL-VI-NEXT: s_max_i32 s2, s3, s2 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-VI-NEXT: s_max_i32 s2, s2, s3 +; GISEL-VI-NEXT: s_max_i32 s3, s4, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff +; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 -; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 -; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16 -; GISEL-VI-NEXT: s_or_b32 s2, s3, s2 +; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 +; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -477,16 +477,16 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-GFX9-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GISEL-GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX9-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX9-NEXT: s_max_i32 s3, s4, 0 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX9-NEXT: s_max_i32 s3, s4, s3 +; GISEL-GFX9-NEXT: s_max_i32 s2, s2, 0 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s2 ; GISEL-GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0xff00ff @@ -500,17 +500,17 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; GISEL-GFX11-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s4 -; GISEL-GFX11-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX11-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX11-NEXT: s_max_i32 s3, s4, 0 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3 +; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 ; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll index 973b7f5c049871..91234441f4a0e0 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll @@ -25,16 +25,16 @@ bb: define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_sub_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v2, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5] +; GFX9-NEXT: global_load_dword v4, v2, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll index f0cbeba1cfb743..bee2b706fef149 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -10,15 +10,15 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 @@ -34,7 +34,7 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx0, i32 %idx1) #1 { ; GCN-LABEL: extract_insert_different_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 @@ -84,15 +84,15 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1 define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_elt2_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 @@ -108,20 +108,20 @@ define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, float %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: buffer_store_dword v0, v[4:5], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index bdde260ff8bd5e..e408e83da1c298 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1585,8 +1585,8 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] @@ -1603,8 +1603,8 @@ define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly ; GFX10-LABEL: fma_shuffle_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 @@ -1622,23 +1622,23 @@ define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly ; GFX11-LABEL: fma_shuffle_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX11-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1713,7 +1713,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out) { ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1727,7 +1727,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1741,7 +1741,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX11-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -4235,15 +4235,15 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2bf16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: s_mov_b32 s3, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[0:1] -; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[8:9] -; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[10:11] +; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -4319,14 +4319,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-LABEL: fma_shuffle_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4402,8 +4402,8 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-LABEL: fma_shuffle_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 00baea8ed7a275..2d3b34e9bddec8 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -462,9 +462,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-LABEL: name: livevariables_update_missed_block ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 + ; SI-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr4_sgpr5 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]](s32), implicit $exec ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -473,9 +473,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.1.if.then: ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: early-clobber %33:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %33.sub0, killed %54, 0, implicit $exec - ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %33.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: early-clobber %34:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %34.sub0, killed %55, 0, implicit $exec + ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %34.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec @@ -502,14 +502,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %55:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %56:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %41:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %42:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 @@ -562,17 +562,17 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-LABEL: name: nested_waterfalls ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.1(0x80000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 + ; SI-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr4_sgpr5 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.1.if.then: ; SI-NEXT: successors: %bb.2(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: early-clobber %10:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4) + ; SI-NEXT: early-clobber %11:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4) ; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec - ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %10, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1) + ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %11, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1) ; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4) ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3 ; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2 @@ -635,7 +635,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.5: ; SI-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %28:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %29:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec ; SI-NEXT: {{ $}} @@ -648,7 +648,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] - ; SI-NEXT: GLOBAL_STORE_DWORD undef %31:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; SI-NEXT: GLOBAL_STORE_DWORD undef %32:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; SI-NEXT: S_ENDPGM 0 entry: %0 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 2c9b53b46c098e..45ea6b62761cc1 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 s0, s0, s13 +; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 4939d52651d96b..c779f1d548ea02 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -10,40 +10,44 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-LABEL: kern: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s12, s12, s17 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: ; implicit-def: $vgpr40 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: v_readlane_b32 s14, v40, 0 +; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v3, s8 ; CHECK-NEXT: v_mov_b32_e32 v4, s9 ; CHECK-NEXT: flat_store_dword v[3:4], v5 -; CHECK-NEXT: s_mov_b64 s[16:17], 8 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: s_mov_b32 s6, s7 -; CHECK-NEXT: s_mov_b32 s9, s16 -; CHECK-NEXT: s_mov_b32 s7, s17 -; CHECK-NEXT: s_add_u32 s8, s8, s9 -; CHECK-NEXT: s_addc_u32 s6, s6, s7 +; CHECK-NEXT: s_mov_b64 s[18:19], 8 +; CHECK-NEXT: s_mov_b32 s8, s16 +; CHECK-NEXT: s_mov_b32 s9, s17 +; CHECK-NEXT: s_mov_b32 s16, s18 +; CHECK-NEXT: s_mov_b32 s15, s19 +; CHECK-NEXT: s_add_u32 s8, s8, s16 +; CHECK-NEXT: s_addc_u32 s15, s9, s15 ; CHECK-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; CHECK-NEXT: s_mov_b32 s9, s6 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, unknown_call@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, unknown_call@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; CHECK-NEXT: s_mov_b32 s9, s15 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, unknown_call@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, unknown_call@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] -; CHECK-NEXT: s_mov_b32 s6, 20 -; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2 -; CHECK-NEXT: s_mov_b32 s6, 10 -; CHECK-NEXT: v_lshlrev_b32_e64 v1, s6, v1 +; CHECK-NEXT: s_mov_b32 s15, 20 +; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2 +; CHECK-NEXT: s_mov_b32 s15, 10 +; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1 ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: s_mov_b32 s15, 42 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 2d5e5a9160fdf7..b5e4bcd049c42a 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -4,33 +4,33 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v4, v2, s[4:5] ; GFX906-NEXT: s_mov_b32 s4, 0xff0000 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dword v4, v2, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v0, v2, s[6:7] +; GFX906-NEXT: global_load_dword v0, v2, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[0:1] offset:2 -; GFX906-NEXT: global_store_short v1, v4, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2 +; GFX906-NEXT: global_store_short v1, v4, s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -52,21 +52,21 @@ bb.2: define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dword v2, v3, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v2, v3, s[6:7] +; GFX906-NEXT: global_load_dword v2, v3, s[2:3] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v2, s[0:1] +; GFX906-NEXT: global_store_dword v1, v2, s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -88,25 +88,25 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: global_store_byte v3, v2, s[0:1] offset:4 -; GFX906-NEXT: global_store_dword v3, v1, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: global_store_byte v3, v2, s[6:7] offset:4 +; GFX906-NEXT: global_store_dword v3, v1, s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -128,21 +128,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -164,21 +164,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[2:3] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -200,25 +200,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3] ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -240,16 +240,16 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0 ; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:240 ; GFX906-NEXT: s_mov_b32 s14, -1 ; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 +; GFX906-NEXT: s_add_u32 s12, s12, s11 ; GFX906-NEXT: s_addc_u32 s13, s13, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: v_mov_b32_e32 v4, 0 @@ -259,79 +259,79 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:224 ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[4:5] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[4:5] offset:192 -; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[4:5] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[4:5] offset:160 -; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[4:5] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[4:5] offset:128 -; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[4:5] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[4:5] offset:96 -; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[4:5] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[4:5] offset:64 -; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[4:5] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[4:5] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[4:5] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[0:1] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:240 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[6:7] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[6:7] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[6:7] offset:192 -; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[6:7] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[6:7] offset:160 -; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[6:7] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[6:7] offset:128 -; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[6:7] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[6:7] offset:96 -; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[6:7] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[6:7] offset:64 -; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[6:7] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[6:7] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[6:7] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:224 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:208 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:192 +; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:176 +; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:160 +; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:144 +; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:128 +; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3] offset:112 +; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:96 +; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:80 +; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:64 +; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:48 +; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 +; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112 ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:96 +; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:96 ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:80 +; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:80 ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:64 +; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:64 ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:48 +; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:48 ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:32 +; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 -; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] offset:224 -; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:208 -; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:192 -; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:176 -; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:160 -; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:144 -; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:128 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240 +; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:224 +; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:208 +; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:192 +; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:176 +; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:160 +; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:144 +; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:128 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -353,9 +353,9 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s8, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dword s8, s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_cmp_lt_i32 s8, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 @@ -364,18 +364,18 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.2: ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX906-NEXT: global_load_dword v0, v0, s[4:5] +; GFX906-NEXT: global_load_dword v0, v0, s[0:1] ; GFX906-NEXT: s_branch .LBB7_5 ; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 ; GFX906-NEXT: s_cmp_eq_u32 s8, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.4: ; %sw.bb5 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX906-NEXT: global_load_dword v0, v0, s[6:7] +; GFX906-NEXT: global_load_dword v0, v0, s[2:3] ; GFX906-NEXT: .LBB7_5: ; %return.sink.split ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: global_store_dword v1, v0, s[6:7] ; GFX906-NEXT: .LBB7_6: ; %return ; GFX906-NEXT: s_endpgm entry: @@ -405,16 +405,16 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9] ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB8_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec @@ -426,12 +426,12 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; %bb.3: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13] ; GFX906-NEXT: .LBB8_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -460,17 +460,17 @@ bb.3: define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_zeroinit: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9] ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB9_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11] ; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_mov_b32 s5, s4 @@ -489,12 +489,12 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] ; GFX906-NEXT: .LBB9_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -522,7 +522,7 @@ bb.3: define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_const: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -534,7 +534,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; implicit-def: $vgpr12 ; GFX906-NEXT: ; implicit-def: $vgpr16 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[8:9] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -545,7 +545,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB10_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[10:11] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec @@ -581,7 +581,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] ; GFX906-NEXT: v_mov_b32_e32 v3, v1 ; GFX906-NEXT: v_mov_b32_e32 v13, v10 ; GFX906-NEXT: v_mov_b32_e32 v11, v9 @@ -603,7 +603,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[10:11] +; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -631,31 +631,31 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[8:9] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; GFX906-NEXT: v_mov_b32_e32 v2, v4 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB11_4 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[6:7] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[10:11] ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB11_3 ; GFX906-NEXT: ; %bb.2: ; %bb.2 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9] +; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] ; GFX906-NEXT: .LBB11_3: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: .LBB11_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[10:11] +; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[14:15] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -682,10 +682,10 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 -; GFX906-NEXT: s_mov_b32 s4, 0x2000604 +; GFX906-NEXT: s_mov_b32 s2, 0x2000604 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v1, v1, s[0:1] ; GFX906-NEXT: s_mov_b64 s[0:1], 0 @@ -695,12 +695,12 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc ; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] -; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s2 ; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_cbranch_execnz .LBB12_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -728,13 +728,13 @@ bb.2: define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) { ; GFX906-LABEL: v8i8_multiuse_multiblock: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[4:5] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[8:9] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX906-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -757,10 +757,10 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr ; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX906-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dword v3, v1, s[8:9] -; GFX906-NEXT: global_store_dword v3, v7, s[8:9] offset:8 -; GFX906-NEXT: global_store_dword v3, v6, s[8:9] offset:16 -; GFX906-NEXT: global_store_dword v3, v4, s[8:9] offset:24 +; GFX906-NEXT: global_store_dword v3, v1, s[12:13] +; GFX906-NEXT: global_store_dword v3, v7, s[12:13] offset:8 +; GFX906-NEXT: global_store_dword v3, v6, s[12:13] offset:16 +; GFX906-NEXT: global_store_dword v3, v4, s[12:13] offset:24 ; GFX906-NEXT: .LBB13_2: ; %Flow ; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX906-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] @@ -782,10 +782,10 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr ; GFX906-NEXT: v_and_or_b32 v7, v1, s3, v6 ; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX906-NEXT: global_store_dword v0, v3, s[10:11] -; GFX906-NEXT: global_store_dword v0, v4, s[10:11] offset:8 -; GFX906-NEXT: global_store_dword v0, v7, s[10:11] offset:16 -; GFX906-NEXT: global_store_dword v0, v2, s[10:11] offset:24 +; GFX906-NEXT: global_store_dword v0, v3, s[14:15] +; GFX906-NEXT: global_store_dword v0, v4, s[14:15] offset:8 +; GFX906-NEXT: global_store_dword v0, v7, s[14:15] offset:16 +; GFX906-NEXT: global_store_dword v0, v2, s[14:15] offset:24 ; GFX906-NEXT: .LBB13_4: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX906-NEXT: s_movk_i32 s3, 0xff00 diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll index 3df757a426ae96..4ce71e1de039b5 100644 --- a/llvm/test/CodeGen/AMDGPU/vselect.ll +++ b/llvm/test/CodeGen/AMDGPU/vselect.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) { ; SI-LABEL: test_select_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: test_select_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -73,33 +73,33 @@ entry: define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: test_select_v2f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_select_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 @@ -148,56 +148,56 @@ entry: define amdgpu_kernel void @test_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <4 x i32> %val) { ; SI-LABEL: test_select_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x0 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; SI-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_gt_i32 s10, s14 -; SI-NEXT: s_cselect_b32 s2, s2, s10 +; SI-NEXT: s_cselect_b32 s6, s6, s10 ; SI-NEXT: s_cmp_gt_i32 s9, s13 -; SI-NEXT: s_cselect_b32 s1, s1, s9 +; SI-NEXT: s_cselect_b32 s5, s5, s9 ; SI-NEXT: s_cmp_gt_i32 s11, s15 -; SI-NEXT: s_cselect_b32 s3, s3, s11 +; SI-NEXT: s_cselect_b32 s7, s7, s11 ; SI-NEXT: s_cmp_gt_i32 s8, s12 -; SI-NEXT: s_cselect_b32 s0, s0, s8 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_cselect_b32 s4, s4, s8 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_select_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_gt_i32 s14, s18 -; VI-NEXT: s_cselect_b32 s2, s2, s14 -; VI-NEXT: s_cmp_gt_i32 s13, s17 -; VI-NEXT: s_cselect_b32 s1, s1, s13 -; VI-NEXT: s_cmp_gt_i32 s15, s19 -; VI-NEXT: s_cselect_b32 s3, s3, s15 -; VI-NEXT: s_cmp_gt_i32 s12, s16 -; VI-NEXT: s_cselect_b32 s0, s0, s12 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_cmp_gt_i32 s10, s14 +; VI-NEXT: s_cselect_b32 s0, s18, s10 +; VI-NEXT: s_cmp_gt_i32 s9, s13 +; VI-NEXT: s_cselect_b32 s1, s17, s9 +; VI-NEXT: s_cmp_gt_i32 s11, s15 +; VI-NEXT: s_cselect_b32 s2, s19, s11 +; VI-NEXT: s_cmp_gt_i32 s8, s12 +; VI-NEXT: s_cselect_b32 s3, s16, s8 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: test_select_v4i32: @@ -237,41 +237,41 @@ entry: define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: test_select_v4f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v3 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_select_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index d7db68a433319c..6133cb46907237 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -22,25 +22,25 @@ define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX906-NEXT: s_mov_b32 s14, -1 ; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 +; GFX906-NEXT: s_add_u32 s12, s12, s11 ; GFX906-NEXT: s_addc_u32 s13, s13, 0 ; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:4 ; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:8 ; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:12 -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x1c -; GFX906-NEXT: s_mov_b64 s[2:3], exec +; GFX906-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1c +; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_bitcmp1_b32 s4, 0 +; GFX906-NEXT: s_bitcmp1_b32 s2, 0 ; GFX906-NEXT: s_mul_i32 s0, s0, s1 ; GFX906-NEXT: v_mul_u32_u24_e32 v1, s1, v1 ; GFX906-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX906-NEXT: v_add_lshl_u32 v2, v0, v2, 4 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: v_mov_b32_e32 v1, v0 ; GFX906-NEXT: s_cselect_b32 s5, 1, 0 +; GFX906-NEXT: s_mov_b64 s[2:3], exec ; GFX906-NEXT: ds_write_b64 v2, v[0:1] ; GFX906-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_waitcnt vmcnt(3) diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 4576d829b0cb0a..9b13ce6ab69cc7 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) { define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_2xf16: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_2xf16: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -138,11 +138,11 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { ; GFX1032-LABEL: test_vopc_class: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 +; GFX1032-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm @@ -150,11 +150,11 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { ; GFX1064-LABEL: test_vopc_class: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 0x204 +; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 0x204 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm @@ -169,12 +169,12 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 ; GFX1032-LABEL: test_vcmp_vcnd_f16: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s2 ; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo ; GFX1032-NEXT: global_store_short v1, v0, s[0:1] ; GFX1032-NEXT: s_endpgm @@ -182,12 +182,12 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 ; GFX1064-LABEL: test_vcmp_vcnd_f16: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, s2 ; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc ; GFX1064-NEXT: global_store_short v1, v0, s[0:1] ; GFX1064-NEXT: s_endpgm @@ -200,30 +200,30 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[4:5] +; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1 ; GFX1032-NEXT: v_cmp_nle_f32_e64 s0, 1.0, v1 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[4:5] +; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1 ; GFX1064-NEXT: v_cmp_nle_f32_e64 s[0:1], 1.0, v1 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid @@ -239,30 +239,30 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[4:5] +; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1 ; GFX1032-NEXT: v_cmp_gt_i32_e64 s0, 1, v1 ; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[4:5] +; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 ; GFX1064-NEXT: v_cmp_gt_i32_e64 s[0:1], 1, v1 ; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid @@ -278,30 +278,30 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[4:5] +; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v1 ; GFX1032-NEXT: v_cmp_gt_u32_e64 s0, 2, v1 ; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[4:5] +; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1 ; GFX1064-NEXT: v_cmp_gt_u32_e64 s[0:1], 2, v1 ; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid @@ -321,7 +321,7 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dword v0, v0, s[0:1] @@ -334,7 +334,7 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dword v0, v0, s[0:1] @@ -355,7 +355,7 @@ endif: define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_loop_with_if: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -515,43 +515,43 @@ bb13: define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr4 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s2, s2, 1 +; GFX1032-NEXT: s_add_i32 s3, s3, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s2, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s3, s3, s5 +; GFX1032-NEXT: s_or_b32 s4, s4, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 -; GFX1032-NEXT: s_or_b32 s4, s5, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, s5, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr2 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -564,7 +564,7 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 @@ -631,26 +631,26 @@ bb8: define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_addc_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s6 -; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_addc_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s6 -; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s7, v1, vcc -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s2 +; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -664,26 +664,26 @@ bb: define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subbrev_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s6 -; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_subbrev_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s6 -; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s7, v1, vcc -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s2 +; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -697,26 +697,26 @@ bb: define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subb_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s6, v0 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_subb_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s6, v0 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s7, v1, vcc -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -730,18 +730,18 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1032-NEXT: s_or_b64 s[8:9], s[6:7], s[4:5] +; GFX1032-NEXT: s_mov_b32 s8, 0 +; GFX1032-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1032-NEXT: s_sub_u32 s3, 0, s4 +; GFX1032-NEXT: s_sub_u32 s9, 0, s4 ; GFX1032-NEXT: s_subb_u32 s10, 0, s5 ; GFX1032-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1032-NEXT: v_rcp_f32_e32 v0, v0 @@ -753,11 +753,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032-NEXT: s_mul_i32 s11, s3, s0 -; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s1 +; GFX1032-NEXT: s_mul_i32 s11, s9, s0 +; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s1 ; GFX1032-NEXT: s_mul_i32 s12, s10, s1 ; GFX1032-NEXT: s_add_i32 s11, s13, s11 -; GFX1032-NEXT: s_mul_i32 s14, s3, s1 +; GFX1032-NEXT: s_mul_i32 s14, s9, s1 ; GFX1032-NEXT: s_add_i32 s11, s11, s12 ; GFX1032-NEXT: s_mul_hi_u32 s13, s1, s14 ; GFX1032-NEXT: s_mul_hi_u32 s15, s0, s14 @@ -777,46 +777,46 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_addc_u32 s0, s0, s11 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032-NEXT: s_mul_i32 s11, s3, s0 -; GFX1032-NEXT: s_mul_hi_u32 s12, s3, s1 +; GFX1032-NEXT: s_mul_i32 s11, s9, s0 +; GFX1032-NEXT: s_mul_hi_u32 s12, s9, s1 ; GFX1032-NEXT: s_mul_i32 s10, s10, s1 ; GFX1032-NEXT: s_add_i32 s11, s12, s11 -; GFX1032-NEXT: s_mul_i32 s3, s3, s1 +; GFX1032-NEXT: s_mul_i32 s9, s9, s1 ; GFX1032-NEXT: s_add_i32 s11, s11, s10 -; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s3 -; GFX1032-NEXT: s_mul_i32 s13, s0, s3 -; GFX1032-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s9 +; GFX1032-NEXT: s_mul_i32 s13, s0, s9 +; GFX1032-NEXT: s_mul_hi_u32 s9, s1, s9 ; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11 ; GFX1032-NEXT: s_mul_i32 s1, s1, s11 ; GFX1032-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1032-NEXT: s_add_u32 s1, s3, s1 -; GFX1032-NEXT: s_addc_u32 s3, 0, s14 +; GFX1032-NEXT: s_add_u32 s1, s9, s1 +; GFX1032-NEXT: s_addc_u32 s9, 0, s14 ; GFX1032-NEXT: s_add_u32 s1, s1, s13 ; GFX1032-NEXT: s_mul_i32 s11, s0, s11 -; GFX1032-NEXT: s_addc_u32 s1, s3, s12 -; GFX1032-NEXT: s_addc_u32 s3, s10, 0 +; GFX1032-NEXT: s_addc_u32 s1, s9, s12 +; GFX1032-NEXT: s_addc_u32 s9, s10, 0 ; GFX1032-NEXT: s_add_u32 s1, s1, s11 -; GFX1032-NEXT: s_addc_u32 s3, 0, s3 +; GFX1032-NEXT: s_addc_u32 s9, 0, s9 ; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_addc_u32 s0, s0, s3 +; GFX1032-NEXT: s_addc_u32 s0, s0, s9 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 ; GFX1032-NEXT: s_mul_i32 s10, s6, s0 -; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s0 +; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s0 ; GFX1032-NEXT: s_mul_hi_u32 s11, s7, s0 ; GFX1032-NEXT: s_mul_i32 s0, s7, s0 ; GFX1032-NEXT: s_mul_hi_u32 s12, s6, s1 ; GFX1032-NEXT: s_mul_hi_u32 s13, s7, s1 ; GFX1032-NEXT: s_mul_i32 s1, s7, s1 ; GFX1032-NEXT: s_add_u32 s10, s12, s10 -; GFX1032-NEXT: s_addc_u32 s3, 0, s3 +; GFX1032-NEXT: s_addc_u32 s9, 0, s9 ; GFX1032-NEXT: s_add_u32 s1, s10, s1 -; GFX1032-NEXT: s_addc_u32 s1, s3, s13 -; GFX1032-NEXT: s_addc_u32 s3, s11, 0 +; GFX1032-NEXT: s_addc_u32 s1, s9, s13 +; GFX1032-NEXT: s_addc_u32 s9, s11, 0 ; GFX1032-NEXT: s_add_u32 s1, s1, s0 -; GFX1032-NEXT: s_addc_u32 s3, 0, s3 +; GFX1032-NEXT: s_addc_u32 s9, 0, s9 ; GFX1032-NEXT: s_mul_hi_u32 s0, s4, s1 -; GFX1032-NEXT: s_mul_i32 s11, s4, s3 +; GFX1032-NEXT: s_mul_i32 s11, s4, s9 ; GFX1032-NEXT: s_mul_i32 s12, s4, s1 ; GFX1032-NEXT: s_add_i32 s0, s0, s11 ; GFX1032-NEXT: v_sub_co_u32 v0, s11, s6, s12 @@ -836,9 +836,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: s_add_u32 s10, s1, 1 ; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1032-NEXT: s_addc_u32 s12, s3, 0 +; GFX1032-NEXT: s_addc_u32 s12, s9, 0 ; GFX1032-NEXT: s_add_u32 s13, s1, 2 -; GFX1032-NEXT: s_addc_u32 s14, s3, 0 +; GFX1032-NEXT: s_addc_u32 s14, s9, 0 ; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v0 ; GFX1032-NEXT: s_subb_u32 s0, s7, s0 @@ -854,9 +854,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX1032-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s2 +; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX1032-NEXT: .LBB15_2: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -870,21 +870,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_add_i32 s0, s0, s1 ; GFX1032-NEXT: s_mul_hi_u32 s0, s6, s0 ; GFX1032-NEXT: s_mul_i32 s1, s0, s4 -; GFX1032-NEXT: s_add_i32 s2, s0, 1 +; GFX1032-NEXT: s_add_i32 s5, s0, 1 ; GFX1032-NEXT: s_sub_i32 s1, s6, s1 -; GFX1032-NEXT: s_sub_i32 s3, s1, s4 +; GFX1032-NEXT: s_sub_i32 s6, s1, s4 ; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1032-NEXT: s_cselect_b32 s0, s2, s0 -; GFX1032-NEXT: s_cselect_b32 s1, s3, s1 -; GFX1032-NEXT: s_add_i32 s2, s0, 1 +; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 +; GFX1032-NEXT: s_cselect_b32 s1, s6, s1 +; GFX1032-NEXT: s_add_i32 s5, s0, 1 ; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 ; GFX1032-NEXT: s_mov_b32 s1, 0 -; GFX1032-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 ; GFX1032-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: .LBB15_3: ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] offset:16 +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 ; GFX1032-NEXT: s_endpgm ; GFX1032-NEXT: .LBB15_4: ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -892,9 +892,9 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] ; GFX1064-NEXT: s_mov_b32 s0, 0 @@ -903,7 +903,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1064-NEXT: s_sub_u32 s3, 0, s4 +; GFX1064-NEXT: s_sub_u32 s9, 0, s4 ; GFX1064-NEXT: s_subb_u32 s10, 0, s5 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 @@ -913,92 +913,92 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: s_mul_i32 s1, s3, s2 -; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s0 +; GFX1064-NEXT: s_mul_i32 s1, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s0 ; GFX1064-NEXT: s_mul_i32 s11, s10, s0 ; GFX1064-NEXT: s_add_i32 s1, s12, s1 -; GFX1064-NEXT: s_mul_i32 s13, s3, s0 +; GFX1064-NEXT: s_mul_i32 s13, s9, s0 ; GFX1064-NEXT: s_add_i32 s1, s1, s11 ; GFX1064-NEXT: s_mul_hi_u32 s12, s0, s13 -; GFX1064-NEXT: s_mul_hi_u32 s14, s2, s13 -; GFX1064-NEXT: s_mul_i32 s11, s2, s13 +; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 +; GFX1064-NEXT: s_mul_i32 s11, s8, s13 ; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 ; GFX1064-NEXT: s_mul_i32 s0, s0, s1 -; GFX1064-NEXT: s_mul_hi_u32 s15, s2, s1 +; GFX1064-NEXT: s_mul_hi_u32 s15, s8, s1 ; GFX1064-NEXT: s_add_u32 s0, s12, s0 ; GFX1064-NEXT: s_addc_u32 s12, 0, s13 ; GFX1064-NEXT: s_add_u32 s0, s0, s11 -; GFX1064-NEXT: s_mul_i32 s1, s2, s1 +; GFX1064-NEXT: s_mul_i32 s1, s8, s1 ; GFX1064-NEXT: s_addc_u32 s0, s12, s14 ; GFX1064-NEXT: s_addc_u32 s11, s15, 0 ; GFX1064-NEXT: s_add_u32 s0, s0, s1 ; GFX1064-NEXT: s_addc_u32 s11, 0, s11 ; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_addc_u32 s2, s2, s11 +; GFX1064-NEXT: s_addc_u32 s8, s8, s11 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: s_mul_i32 s1, s3, s2 -; GFX1064-NEXT: s_mul_hi_u32 s11, s3, s0 +; GFX1064-NEXT: s_mul_i32 s1, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s11, s9, s0 ; GFX1064-NEXT: s_mul_i32 s10, s10, s0 ; GFX1064-NEXT: s_add_i32 s1, s11, s1 -; GFX1064-NEXT: s_mul_i32 s3, s3, s0 +; GFX1064-NEXT: s_mul_i32 s9, s9, s0 ; GFX1064-NEXT: s_add_i32 s1, s1, s10 -; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s3 -; GFX1064-NEXT: s_mul_i32 s12, s2, s3 -; GFX1064-NEXT: s_mul_hi_u32 s3, s0, s3 +; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s9 +; GFX1064-NEXT: s_mul_i32 s12, s8, s9 +; GFX1064-NEXT: s_mul_hi_u32 s9, s0, s9 ; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 ; GFX1064-NEXT: s_mul_i32 s0, s0, s1 -; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s1 -; GFX1064-NEXT: s_add_u32 s0, s3, s0 -; GFX1064-NEXT: s_addc_u32 s3, 0, s13 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s1 +; GFX1064-NEXT: s_add_u32 s0, s9, s0 +; GFX1064-NEXT: s_addc_u32 s9, 0, s13 ; GFX1064-NEXT: s_add_u32 s0, s0, s12 -; GFX1064-NEXT: s_mul_i32 s1, s2, s1 -; GFX1064-NEXT: s_addc_u32 s0, s3, s11 -; GFX1064-NEXT: s_addc_u32 s3, s10, 0 +; GFX1064-NEXT: s_mul_i32 s1, s8, s1 +; GFX1064-NEXT: s_addc_u32 s0, s9, s11 +; GFX1064-NEXT: s_addc_u32 s9, s10, 0 ; GFX1064-NEXT: s_add_u32 s0, s0, s1 -; GFX1064-NEXT: s_addc_u32 s3, 0, s3 +; GFX1064-NEXT: s_addc_u32 s9, 0, s9 ; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_addc_u32 s0, s2, s3 +; GFX1064-NEXT: s_addc_u32 s0, s8, s9 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1064-NEXT: s_mul_i32 s3, s6, s0 -; GFX1064-NEXT: s_mul_hi_u32 s2, s6, s0 +; GFX1064-NEXT: s_mul_i32 s9, s6, s0 +; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s0 ; GFX1064-NEXT: s_mul_hi_u32 s10, s7, s0 ; GFX1064-NEXT: s_mul_i32 s0, s7, s0 ; GFX1064-NEXT: s_mul_hi_u32 s11, s6, s1 ; GFX1064-NEXT: s_mul_hi_u32 s12, s7, s1 ; GFX1064-NEXT: s_mul_i32 s1, s7, s1 -; GFX1064-NEXT: s_add_u32 s3, s11, s3 -; GFX1064-NEXT: s_addc_u32 s2, 0, s2 -; GFX1064-NEXT: s_add_u32 s1, s3, s1 -; GFX1064-NEXT: s_addc_u32 s1, s2, s12 -; GFX1064-NEXT: s_addc_u32 s2, s10, 0 +; GFX1064-NEXT: s_add_u32 s9, s11, s9 +; GFX1064-NEXT: s_addc_u32 s8, 0, s8 +; GFX1064-NEXT: s_add_u32 s1, s9, s1 +; GFX1064-NEXT: s_addc_u32 s1, s8, s12 +; GFX1064-NEXT: s_addc_u32 s8, s10, 0 ; GFX1064-NEXT: s_add_u32 s10, s1, s0 -; GFX1064-NEXT: s_addc_u32 s11, 0, s2 +; GFX1064-NEXT: s_addc_u32 s11, 0, s8 ; GFX1064-NEXT: s_mul_hi_u32 s0, s4, s10 ; GFX1064-NEXT: s_mul_i32 s1, s4, s11 -; GFX1064-NEXT: s_mul_i32 s3, s4, s10 +; GFX1064-NEXT: s_mul_i32 s9, s4, s10 ; GFX1064-NEXT: s_add_i32 s12, s0, s1 -; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s3 -; GFX1064-NEXT: s_mul_i32 s2, s5, s10 -; GFX1064-NEXT: s_add_i32 s12, s12, s2 -; GFX1064-NEXT: v_sub_co_u32 v1, s[2:3], v0, s4 +; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9 +; GFX1064-NEXT: s_mul_i32 s8, s5, s10 +; GFX1064-NEXT: s_add_i32 s12, s12, s8 +; GFX1064-NEXT: v_sub_co_u32 v1, s[8:9], v0, s4 ; GFX1064-NEXT: s_sub_i32 s13, s7, s12 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_subb_u32 s13, s13, s5 -; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; GFX1064-NEXT: s_subb_u32 s2, s13, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s2, s5 +; GFX1064-NEXT: s_subb_u32 s8, s13, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s8, s5 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX1064-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s2, s5 +; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s8, s5 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1064-NEXT: s_add_u32 s2, s10, 1 -; GFX1064-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc -; GFX1064-NEXT: s_addc_u32 s3, s11, 0 +; GFX1064-NEXT: s_add_u32 s8, s10, 1 +; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc +; GFX1064-NEXT: s_addc_u32 s9, s11, 0 ; GFX1064-NEXT: s_add_u32 s13, s10, 2 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 @@ -1013,8 +1013,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s14 ; GFX1064-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1] -; GFX1064-NEXT: v_cndmask_b32_e32 v2, s2, v2, vcc -; GFX1064-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc +; GFX1064-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc +; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc ; GFX1064-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc @@ -1031,21 +1031,21 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_add_i32 s0, s0, s1 ; GFX1064-NEXT: s_mul_hi_u32 s0, s6, s0 ; GFX1064-NEXT: s_mul_i32 s1, s0, s4 -; GFX1064-NEXT: s_add_i32 s2, s0, 1 +; GFX1064-NEXT: s_add_i32 s5, s0, 1 ; GFX1064-NEXT: s_sub_i32 s1, s6, s1 -; GFX1064-NEXT: s_sub_i32 s3, s1, s4 +; GFX1064-NEXT: s_sub_i32 s6, s1, s4 ; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1064-NEXT: s_cselect_b32 s0, s2, s0 -; GFX1064-NEXT: s_cselect_b32 s1, s3, s1 -; GFX1064-NEXT: s_add_i32 s2, s0, 1 +; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 +; GFX1064-NEXT: s_cselect_b32 s1, s6, s1 +; GFX1064-NEXT: s_add_i32 s5, s0, 1 ; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 ; GFX1064-NEXT: s_mov_b32 s1, 0 -; GFX1064-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 ; GFX1064-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064-NEXT: v_mov_b32_e32 v1, s1 ; GFX1064-NEXT: .LBB15_3: ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] offset:16 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 ; GFX1064-NEXT: s_endpgm ; GFX1064-NEXT: .LBB15_4: ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1063,30 +1063,30 @@ bb: define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX1032-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_div_scale_f32 v1, s0, v2, v2, v1 -; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1032-NEXT: v_div_scale_f32 v1, s2, v2, v2, v1 +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_scale_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc +; GFX1064-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], v2, v2, v1 -; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1064-NEXT: v_div_scale_f32 v1, s[2:3], v2, v2, v1 +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspa ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1120,7 +1120,7 @@ define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX1064-LABEL: test_div_scale_f64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspa ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1188,31 +1188,31 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-LABEL: test_div_fmas_f32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s6 -; GFX1032-NEXT: s_bitcmp1_b32 s7, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_bitcmp1_b32 s3, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1032-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1032-NEXT: v_div_fmas_f32 v0, s0, v0, v1 +; GFX1032-NEXT: global_store_dword v2, v0, s[6:7] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s5 -; GFX1064-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064-NEXT: s_bitcmp1_b32 s7, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_bitcmp1_b32 s3, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1064-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1064-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1064-NEXT: v_div_fmas_f32 v0, s0, v0, v1 +; GFX1064-NEXT: global_store_dword v2, v0, s[6:7] ; GFX1064-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 @@ -1223,35 +1223,35 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1032-LABEL: test_div_fmas_f64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1032-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x44 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, s8 -; GFX1032-NEXT: v_mov_b32_e32 v1, s9 -; GFX1032-NEXT: v_mov_b32_e32 v2, s10 -; GFX1032-NEXT: v_mov_b32_e32 v3, s11 +; GFX1032-NEXT: v_mov_b32_e32 v0, s12 +; GFX1032-NEXT: v_mov_b32_e32 v1, s13 +; GFX1032-NEXT: v_mov_b32_e32 v2, s14 +; GFX1032-NEXT: v_mov_b32_e32 v3, s15 ; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] +; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1064-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x44 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, s8 -; GFX1064-NEXT: v_mov_b32_e32 v1, s9 -; GFX1064-NEXT: v_mov_b32_e32 v2, s10 -; GFX1064-NEXT: v_mov_b32_e32 v3, s11 +; GFX1064-NEXT: v_mov_b32_e32 v0, s12 +; GFX1064-NEXT: v_mov_b32_e32 v1, s13 +; GFX1064-NEXT: v_mov_b32_e32 v2, s14 +; GFX1064-NEXT: v_mov_b32_e32 v3, s15 ; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] +; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX1064-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone store double %result, ptr addrspace(1) %out, align 8 @@ -1264,18 +1264,18 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] +; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11] ; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; %bb ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: global_load_dword v0, v0, s[8:9] glc dlc +; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo @@ -1285,24 +1285,24 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] offset:8 +; GFX1032-NEXT: global_store_dword v0, v1, s[8:9] offset:8 ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] +; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11] ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; %bb ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: global_load_dword v0, v0, s[8:9] glc dlc +; GFX1064-NEXT: global_load_dword v0, v0, s[6:7] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_b64 vcc, vcc, exec @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 -; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] offset:8 +; GFX1064-NEXT: global_store_dword v0, v1, s[8:9] offset:8 ; GFX1064-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -1344,40 +1344,40 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 +; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 ; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX1032-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_div_scale_f32 v0, s[0:1], s7, s7, s6 +; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s6, s7, s6 +; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2 ; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s7, s6 -; GFX1064-NEXT: global_store_dword v1, v0, s[4:5] +; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2 +; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1064-NEXT: s_endpgm entry: %fdiv = fdiv float %a, %b @@ -1389,41 +1389,41 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1032-LABEL: test_br_cc_f16: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1032-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX1032-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX1032-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2 ; GFX1032-NEXT: ; %bb.1: ; %one -; GFX1032-NEXT: global_store_short v0, v1, s[4:5] +; GFX1032-NEXT: global_store_short v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; GFX1032-NEXT: .LBB24_2: ; %two -; GFX1032-NEXT: global_store_short v0, v2, s[4:5] +; GFX1032-NEXT: global_store_short v0, v2, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_br_cc_f16: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1064-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX1064-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX1064-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2 ; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2 ; GFX1064-NEXT: ; %bb.1: ; %one -; GFX1064-NEXT: global_store_short v0, v1, s[4:5] +; GFX1064-NEXT: global_store_short v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm ; GFX1064-NEXT: .LBB24_2: ; %two -; GFX1064-NEXT: global_store_short v0, v2, s[4:5] +; GFX1064-NEXT: global_store_short v0, v2, s[0:1] ; GFX1064-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -1446,12 +1446,12 @@ two: define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { ; GCN-LABEL: test_brcc_i1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB25_2 ; GCN-NEXT: ; %bb.1: ; %store -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0xde ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,8 +1473,8 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1032-LABEL: test_preserve_condition_undef_flag: ; GFX1032: ; %bb.0: ; %bb0 ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX1032-NEXT: s_load_dword s1, s[4:5], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 ; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 @@ -1493,12 +1493,12 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-LABEL: test_preserve_condition_undef_flag: ; GFX1064: ; %bb.0: ; %bb0 ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dword s5, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, 1.0 -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s5, 1.0 -; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s4, 0 +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s6, 1.0 +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 +; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s6, 0 ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX1064-NEXT: s_and_b64 vcc, exec, s[0:1] @@ -1531,7 +1531,7 @@ bb2: define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GFX1064-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX1064-NEXT: ; implicit-def: $sgpr4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,7 +1634,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -1649,7 +1649,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1672,12 +1672,12 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-LABEL: test_set_inactive: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s4, s2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s2, s3 +; GFX1032-NEXT: s_mov_b32 exec_lo, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: global_store_dword v1, v2, s[0:1] @@ -1686,11 +1686,11 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1064-LABEL: test_set_inactive: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s4, s[2:3] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s6, s[2:3] ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 @@ -1705,30 +1705,30 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, s4 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s2, s4 +; GFX1032-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1] -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s2, s[4:5] +; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 -; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] +; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX1064-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) @@ -2140,23 +2140,23 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; GFX1032-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| -; GFX1064-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s1 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) @@ -2168,11 +2168,11 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; GFX1032-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm @@ -2180,11 +2180,11 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { ; GFX1064-LABEL: test_intr_icmp_i64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 ; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -2197,22 +2197,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| -; GFX1032-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| -; GFX1064-NEXT: v_mov_b32_e32 v1, s0 -; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) @@ -2224,11 +2224,11 @@ define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; GFX1032-NEXT: v_mov_b32_e32 v1, s2 ; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm @@ -2236,11 +2236,11 @@ define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; GFX1064-LABEL: test_intr_icmp_i32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm @@ -2356,42 +2356,42 @@ define amdgpu_ps float @test_ps_live() #0 { define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_neq_f64_e64 s2, s[0:1], 1.0 -; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1032-NEXT: v_cmp_neq_f64_e64 s4, s[2:3], 1.0 +; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_vccnz .LBB47_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1] +; GFX1032-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3] ; GFX1032-NEXT: s_branch .LBB47_3 ; GFX1032-NEXT: .LBB47_2: -; GFX1032-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB47_3: ; %endif ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_neq_f64_e64 s[2:3], s[0:1], 1.0 -; GFX1064-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX1064-NEXT: v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0 +; GFX1064-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX1064-NEXT: s_cbranch_vccnz .LBB47_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1] +; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3] ; GFX1064-NEXT: s_branch .LBB47_3 ; GFX1064-NEXT: .LBB47_2: -; GFX1064-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064-NEXT: v_mov_b32_e32 v1, s3 ; GFX1064-NEXT: .LBB47_3: ; %endif ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm entry: %v = load double, ptr addrspace(1) %in @@ -2473,7 +2473,7 @@ main_body: define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2507,7 +2507,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2568,7 +2568,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-LABEL: fcmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2600,7 +2600,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; ; GFX1064-LABEL: fcmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 @@ -2660,7 +2660,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2694,7 +2694,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2754,7 +2754,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-LABEL: fcmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2786,7 +2786,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; ; GFX1064-LABEL: fcmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 1e737680313cc0..dd03fb62b8ebb0 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -89,7 +89,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_zext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -113,7 +113,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -130,7 +130,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -145,7 +145,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_sext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -169,7 +169,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i17_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -193,7 +193,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i17_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 2 @@ -212,7 +212,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i17_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -239,7 +239,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_f16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -255,7 +255,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_f16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -267,7 +267,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_f16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -286,7 +286,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_v2i8_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -307,7 +307,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_v2i8_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -326,7 +326,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_v2i8_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -354,7 +354,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) %arg) { ; SI-LABEL: no_widen_i16_constant_divergent_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -373,7 +373,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; VI-LABEL: no_widen_i16_constant_divergent_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -390,7 +390,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; GFX11-LABEL: no_widen_i16_constant_divergent_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -417,7 +417,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -432,7 +432,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i1_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -445,7 +445,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i1_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -464,7 +464,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_zextload_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -481,7 +481,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; VI-LABEL: widen_i16_zextload_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -496,7 +496,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; GFX11-LABEL: widen_i16_zextload_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -520,7 +520,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -538,7 +538,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; VI-LABEL: widen_i1_zext_to_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -554,7 +554,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; GFX11-LABEL: widen_i1_zext_to_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -576,7 +576,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; SI-LABEL: widen_i16_constant32_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -592,7 +592,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; VI-LABEL: widen_i16_constant32_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -607,7 +607,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; GFX11-LABEL: widen_i16_constant32_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -629,7 +629,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg) { ; SI-LABEL: widen_i16_global_invariant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -645,7 +645,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; VI-LABEL: widen_i16_global_invariant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -659,7 +659,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; GFX11-LABEL: widen_i16_global_invariant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index c5a9ab31ca529d..1ab82b04ba9459 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX9-LABEL: workgroup_id_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -17,7 +17,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX12-LABEL: workgroup_id_x: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -31,7 +31,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry) { ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff @@ -43,7 +43,7 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v2, s4 @@ -63,14 +63,14 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { ; GFX9-LABEL: workgroup_id_xyz: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_and_b32 s8, ttmp7, 0xffff +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -80,18 +80,18 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX12-NEXT: s_and_b32 s6, ttmp7, 0xffff ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 +; GFX12-NEXT: s_lshr_b32 s7, ttmp7, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-NEXT: global_store_b32 v1, v2, s[6:7] -; GFX12-NEXT: global_store_b32 v1, v3, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v1, v3, s[4:5] ; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 027081752a11bb..08cc2e4ec7d794 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -309,21 +309,21 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O0-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 1 -; GFX9-O0-NEXT: s_mov_b32 s14, s8 -; GFX9-O0-NEXT: s_mov_b32 s13, s7 -; GFX9-O0-NEXT: s_mov_b32 s12, s6 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s12, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s13, 1 +; GFX9-O0-NEXT: s_mov_b32 s14, s10 +; GFX9-O0-NEXT: s_mov_b32 s13, s9 +; GFX9-O0-NEXT: s_mov_b32 s12, s8 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v3, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v3, s5, 3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 2 ; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 3 @@ -331,36 +331,36 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 +; GFX9-O0-NEXT: s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 ; GFX9-O0-NEXT: v_writelane_b32 v3, s16, 4 ; GFX9-O0-NEXT: v_writelane_b32 v3, s17, 5 ; GFX9-O0-NEXT: v_writelane_b32 v3, s18, 6 ; GFX9-O0-NEXT: v_writelane_b32 v3, s19, 7 -; GFX9-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s6, 8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_writelane_b32 v3, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 10 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 -; GFX9-O0-NEXT: s_mov_b32 s3, s6 -; GFX9-O0-NEXT: s_mov_b32 s1, s7 +; GFX9-O0-NEXT: s_mov_b32 s3, s8 +; GFX9-O0-NEXT: s_mov_b32 s1, s9 ; GFX9-O0-NEXT: s_add_u32 s8, s2, s3 ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 @@ -370,12 +370,11 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: s_addc_u32 s17, s17, called@rel32@hi+12 ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 -; GFX9-O0-NEXT: s_mov_b32 s6, 10 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s6, v5 +; GFX9-O0-NEXT: s_mov_b32 s15, 20 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s15, v4 +; GFX9-O0-NEXT: s_mov_b32 s15, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s15, v5 ; GFX9-O0-NEXT: v_or3_b32 v4, v6, v5, v4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 @@ -401,31 +400,32 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O3-NEXT: s_mov_b32 s26, -1 ; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 ; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s10 +; GFX9-O3-NEXT: s_mov_b32 s13, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-O3-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 +; GFX9-O3-NEXT: s_add_u32 s8, s4, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 @@ -541,21 +541,21 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O0-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v8, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v8, s11, 1 -; GFX9-O0-NEXT: s_mov_b32 s14, s8 -; GFX9-O0-NEXT: s_mov_b32 s13, s7 -; GFX9-O0-NEXT: s_mov_b32 s12, s6 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 +; GFX9-O0-NEXT: v_writelane_b32 v8, s12, 0 +; GFX9-O0-NEXT: v_writelane_b32 v8, s13, 1 +; GFX9-O0-NEXT: s_mov_b32 s14, s10 +; GFX9-O0-NEXT: s_mov_b32 s13, s9 +; GFX9-O0-NEXT: s_mov_b32 s12, s8 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 ; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 @@ -564,38 +564,38 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s8, s19 +; GFX9-O0-NEXT: s_mov_b32 s9, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 -; GFX9-O0-NEXT: s_mov_b32 s18, s7 -; GFX9-O0-NEXT: s_mov_b32 s19, s6 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 +; GFX9-O0-NEXT: s_mov_b32 s18, s9 +; GFX9-O0-NEXT: s_mov_b32 s19, s8 ; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 ; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 ; GFX9-O0-NEXT: v_writelane_b32 v8, s18, 6 ; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: s_mov_b32 s15, s7 -; GFX9-O0-NEXT: s_mov_b32 s8, s3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-O0-NEXT: s_mov_b32 s15, s9 +; GFX9-O0-NEXT: s_mov_b32 s16, s3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[16:17] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 ; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 ; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 @@ -604,11 +604,11 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b32 s2, 32 ; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 -; GFX9-O0-NEXT: s_mov_b32 s3, s6 -; GFX9-O0-NEXT: s_mov_b32 s1, s7 +; GFX9-O0-NEXT: s_mov_b32 s3, s8 +; GFX9-O0-NEXT: s_mov_b32 s1, s9 ; GFX9-O0-NEXT: s_add_u32 s8, s2, s3 ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 @@ -619,12 +619,11 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s6, v3 -; GFX9-O0-NEXT: s_mov_b32 s6, 10 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 +; GFX9-O0-NEXT: s_mov_b32 s15, 20 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s15, v3 +; GFX9-O0-NEXT: s_mov_b32 s15, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s15, v4 ; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 @@ -658,39 +657,40 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O3-NEXT: s_mov_b32 s26, -1 ; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 ; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s10 +; GFX9-O3-NEXT: s_mov_b32 s13, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-O3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[2:3] -; GFX9-O3-NEXT: s_add_u32 s2, s2, called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s3, s3, called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 +; GFX9-O3-NEXT: s_add_u32 s8, s4, 60 +; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-O3-NEXT: s_getpc_b64 s[4:5] +; GFX9-O3-NEXT: s_add_u32 s4, s4, called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s5, s5, called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 @@ -1161,21 +1161,21 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O0-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 1 -; GFX9-O0-NEXT: s_mov_b32 s14, s8 -; GFX9-O0-NEXT: s_mov_b32 s13, s7 -; GFX9-O0-NEXT: s_mov_b32 s12, s6 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s12, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s13, 1 +; GFX9-O0-NEXT: s_mov_b32 s14, s10 +; GFX9-O0-NEXT: s_mov_b32 s13, s9 +; GFX9-O0-NEXT: s_mov_b32 s12, s8 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v3, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v3, s5, 3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 2 ; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 3 @@ -1183,36 +1183,36 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s7 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s9 -; GFX9-O0-NEXT: s_mov_b32 s16, s8 +; GFX9-O0-NEXT: s_mov_b32 s3, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s9, s17 +; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s7 -; GFX9-O0-NEXT: s_mov_b32 s18, s6 +; GFX9-O0-NEXT: s_mov_b32 s17, s9 +; GFX9-O0-NEXT: s_mov_b32 s18, s8 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 ; GFX9-O0-NEXT: v_writelane_b32 v3, s16, 4 ; GFX9-O0-NEXT: v_writelane_b32 v3, s17, 5 ; GFX9-O0-NEXT: v_writelane_b32 v3, s18, 6 ; GFX9-O0-NEXT: v_writelane_b32 v3, s19, 7 -; GFX9-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s6, 8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_writelane_b32 v3, s2, 9 ; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 10 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 -; GFX9-O0-NEXT: s_mov_b32 s3, s6 -; GFX9-O0-NEXT: s_mov_b32 s1, s7 +; GFX9-O0-NEXT: s_mov_b32 s3, s8 +; GFX9-O0-NEXT: s_mov_b32 s1, s9 ; GFX9-O0-NEXT: s_add_u32 s8, s2, s3 ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 @@ -1222,12 +1222,11 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: s_addc_u32 s17, s17, strict_wwm_called@rel32@hi+12 ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 -; GFX9-O0-NEXT: s_mov_b32 s6, 10 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s6, v5 +; GFX9-O0-NEXT: s_mov_b32 s15, 20 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s15, v4 +; GFX9-O0-NEXT: s_mov_b32 s15, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s15, v5 ; GFX9-O0-NEXT: v_or3_b32 v4, v6, v5, v4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 @@ -1253,31 +1252,32 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O3-NEXT: s_mov_b32 s26, -1 ; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 ; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s10 +; GFX9-O3-NEXT: s_mov_b32 s13, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-O3-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 +; GFX9-O3-NEXT: s_add_u32 s8, s4, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 @@ -1393,21 +1393,21 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O0-NEXT: s_mov_b32 s26, -1 ; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O0-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O0-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v8, s10, 0 -; GFX9-O0-NEXT: v_writelane_b32 v8, s11, 1 -; GFX9-O0-NEXT: s_mov_b32 s14, s8 -; GFX9-O0-NEXT: s_mov_b32 s13, s7 -; GFX9-O0-NEXT: s_mov_b32 s12, s6 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 -; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 +; GFX9-O0-NEXT: v_writelane_b32 v8, s12, 0 +; GFX9-O0-NEXT: v_writelane_b32 v8, s13, 1 +; GFX9-O0-NEXT: s_mov_b32 s14, s10 +; GFX9-O0-NEXT: s_mov_b32 s13, s9 +; GFX9-O0-NEXT: s_mov_b32 s12, s8 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 +; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 ; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 @@ -1416,38 +1416,38 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: s_mov_b32 s8, s17 +; GFX9-O0-NEXT: s_mov_b32 s8, s19 +; GFX9-O0-NEXT: s_mov_b32 s9, s18 +; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s8 -; GFX9-O0-NEXT: s_mov_b32 s18, s7 -; GFX9-O0-NEXT: s_mov_b32 s19, s6 +; GFX9-O0-NEXT: s_mov_b32 s17, s15 +; GFX9-O0-NEXT: s_mov_b32 s18, s9 +; GFX9-O0-NEXT: s_mov_b32 s19, s8 ; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 ; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 ; GFX9-O0-NEXT: v_writelane_b32 v8, s18, 6 ; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: s_mov_b32 s15, s7 -; GFX9-O0-NEXT: s_mov_b32 s8, s3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-O0-NEXT: s_mov_b32 s15, s9 +; GFX9-O0-NEXT: s_mov_b32 s16, s3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[16:17] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 ; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 ; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 @@ -1456,11 +1456,11 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b32 s2, 32 ; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 -; GFX9-O0-NEXT: s_mov_b32 s3, s6 -; GFX9-O0-NEXT: s_mov_b32 s1, s7 +; GFX9-O0-NEXT: s_mov_b32 s3, s8 +; GFX9-O0-NEXT: s_mov_b32 s1, s9 ; GFX9-O0-NEXT: s_add_u32 s8, s2, s3 ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 @@ -1471,12 +1471,11 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O0-NEXT: s_mov_b32 s6, 20 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s6, v3 -; GFX9-O0-NEXT: s_mov_b32 s6, 10 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s6, v4 +; GFX9-O0-NEXT: s_mov_b32 s15, 20 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s15, v3 +; GFX9-O0-NEXT: s_mov_b32 s15, 10 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s15, v4 ; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 @@ -1510,39 +1509,40 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; GFX9-O3-NEXT: s_mov_b32 s26, -1 ; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_add_u32 s24, s24, s11 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 ; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GFX9-O3-NEXT: s_mov_b32 s14, s10 +; GFX9-O3-NEXT: s_mov_b32 s13, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, s8 +; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-O3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] ; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[2:3] -; GFX9-O3-NEXT: s_add_u32 s2, s2, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s3, s3, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 +; GFX9-O3-NEXT: s_add_u32 s8, s4, 60 +; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-O3-NEXT: s_getpc_b64 s[4:5] +; GFX9-O3-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 9fac17f33d0d36..8c9dac781d5da1 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -5,40 +5,40 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: v_xor_b32_e32 v0, v0, v2 @@ -54,42 +54,42 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v7, v3 ; SI-NEXT: v_xor_b32_e32 v2, v6, v2 ; SI-NEXT: v_xor_b32_e32 v1, v5, v1 ; SI-NEXT: v_xor_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v3, v3, v7 ; VI-NEXT: v_xor_b32_e32 v2, v2, v6 @@ -107,43 +107,43 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -165,43 +165,43 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: v_xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_load_ubyte v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_ubyte v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_xor_b32_e32 v2, v4, v2 ; VI-NEXT: v_and_b32_e32 v2, 1, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -216,39 +216,39 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v2, v4, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -263,7 +263,7 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -276,7 +276,7 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: scalar_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -292,22 +292,22 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: scalar_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s4, s4 +; SI-NEXT: s_not_b32 s4, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_not_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,7 +321,7 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -360,40 +360,40 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_xor_b32_e32 v0, v0, v2 ; VI-NEXT: v_xor_b32_e32 v1, v1, v3 @@ -409,28 +409,28 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_xor_b64 s[4:5], s[6:7], s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -442,7 +442,7 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: scalar_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -456,7 +456,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: scalar_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -473,7 +473,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -492,7 +492,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -514,7 +514,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { ; SI-LABEL: xor_cf: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -545,7 +545,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -591,22 +591,22 @@ endif: define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s5, s5, 0xf237b -; SI-NEXT: s_xor_b32 s4, s4, 0x3039 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_xor_b32 s4, s7, 0xf237b +; SI-NEXT: s_xor_b32 s5, s6, 0x3039 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s1, s1, 0xf237b ; VI-NEXT: s_xor_b32 s0, s0, 0x3039 @@ -624,8 +624,8 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_literal_multi_use_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x13 ; SI-NEXT: s_movk_i32 s8, 0x3039 ; SI-NEXT: s_mov_b32 s9, 0xf237b ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -646,19 +646,19 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ; ; VI-LABEL: scalar_xor_literal_multi_use_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_movk_i32 s2, 0x3039 -; VI-NEXT: s_mov_b32 s3, 0xf237b +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_movk_i32 s6, 0x3039 +; VI-NEXT: s_mov_b32 s7, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_add_u32 s0, s6, 0x3039 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s2, 0x3039 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_addc_u32 s1, s7, 0xf237b +; VI-NEXT: s_addc_u32 s1, s3, 0xf237b ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -675,21 +675,21 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s4, s4, 63 +; SI-NEXT: s_xor_b32 s4, s6, 63 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: scalar_xor_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s0, s0, 63 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -706,12 +706,12 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_neg_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -8 +; SI-NEXT: s_xor_b64 s[4:5], s[6:7], -8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -719,8 +719,8 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; ; VI-LABEL: scalar_xor_neg_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -737,7 +737,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_i64_neg_inline_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -756,7 +756,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; ; VI-LABEL: vector_xor_i64_neg_inline_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -777,7 +777,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -796,7 +796,7 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: vector_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll index 28da8ac423107c..45cb7955b612d5 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,15 +25,15 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: s_and_b32 s4, s8, 0xffff +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -47,12 +47,12 @@ define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -68,13 +68,13 @@ define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @zext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index b69ede6f24f0f1..077d22fc895ae5 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -28,12 +28,13 @@ ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } -; CHECK-NEXT: dispatchID: { reg: '$sgpr8_sgpr9' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr10' } -; CHECK-NEXT: workGroupIDY: { reg: '$sgpr11' } -; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr12' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr13' } +; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } +; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' } +; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' } +; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr15' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } ; CHECK-NEXT: workItemIDY: { reg: '$vgpr1' } ; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' } diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index e2dada85ef8729..6633cec659d8e5 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -4,17 +4,17 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CHECK-NEXT: s_add_u32 s0, s0, s4 -; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: s_ashr_i32 s7, s6, 31 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 +; CHECK-NEXT: s_add_u32 s0, s2, s0 +; CHECK-NEXT: s_addc_u32 s1, s3, s1 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -35,20 +35,20 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub ; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_cbranch_execz .LBB1_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: s_load_dword s8, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s9, s8, 31 -; CHECK-NEXT: s_lshl_b64 s[2:3], s[8:9], 3 -; CHECK-NEXT: s_add_u32 s2, s4, s2 -; CHECK-NEXT: s_addc_u32 s3, s5, s3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; CHECK-NEXT: s_add_u32 s2, s8, s2 +; CHECK-NEXT: s_addc_u32 s3, s9, s3 ; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; CHECK-NEXT: v_mul_f64 v[0:1], s[10:11], v[0:1] ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol @@ -66,33 +66,33 @@ entry: define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; CHECK-NEXT: s_mov_b64 s[6:7], exec ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 -; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc ; CHECK-NEXT: s_cbranch_execz .LBB2_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: s_load_dword s2, s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s3, s2, 31 -; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 -; CHECK-NEXT: s_add_u32 s2, s4, s2 -; CHECK-NEXT: s_addc_u32 s3, s5, s3 -; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] -; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-7 +; CHECK-NEXT: s_ashr_i32 s5, s4, 31 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[6:7] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: .LBB2_2: @@ -116,15 +116,15 @@ bb1: ; preds = %entry define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferPHI: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CHECK-NEXT: s_add_u32 s0, s4, s0 -; CHECK-NEXT: s_addc_u32 s1, s5, s1 -; CHECK-NEXT: s_add_u32 s2, s0, -8 -; CHECK-NEXT: s_addc_u32 s3, s1, -1 +; CHECK-NEXT: s_ashr_i32 s7, s6, 31 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: s_add_u32 s4, s0, -8 +; CHECK-NEXT: s_addc_u32 s5, s1, -1 ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -138,14 +138,14 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: s_cbranch_execz .LBB3_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: .LBB3_4: diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index bdba2436346895..2202b6446fd15c 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -5,8 +5,8 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 25 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 -; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 +; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9 ; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 @@ -28,7 +28,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 15 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t6: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0> ; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7 @@ -47,7 +47,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_USHORT_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<65535> @@ -68,7 +68,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_UBYTE_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<255>